This R script is used to validate the points in the ReSurvey database using RS indicators (NDVI, NDMI, canopy height).

Load libraries

library(tidyverse)
library(here)
library(gridExtra)
library(readxl)
library(scales)
library(sf)
library(rnaturalearth)
library(dtplyr)
library(lme4)
library(lmerTest)
library(car)
library(ggeffects)
library(party)
library(partykit)
library(strucchange)
library(ggparty)
library(caret)
library(moreparty)
library(randomForest)

Define printall function

printall <- function(tibble) {
  print(tibble, width = Inf)
  }

Load geom_flat_violin plot

source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")

Read ReSurvey data with RS indicators

db_resurv_RS<-read_tsv(
  here("data", "clean", "db_resurv_RS_20250505.csv"),
  col_types = cols(
    # Dynamically specify EUNIS columns as character
    .default = col_guess(),  # Default guessing for other columns
    EUNISa = col_character(),
    EUNISb = col_character(),
    EUNISc = col_character(),
    EUNISd = col_character(),
    EUNISa_1 = col_character(),
    EUNISa_2 = col_character(),
    EUNISa_3 = col_character(),
    EUNISa_4 = col_character(),
    EUNISb_1 = col_character(),
    EUNISb_2 = col_character(),
    EUNISb_3 = col_character(),
    EUNISb_4 = col_character(),
    EUNISc_1 = col_character(),
    EUNISc_2 = col_character(),
    EUNISc_3 = col_character(),
    EUNISc_4 = col_character(),
    EUNISd_1 = col_character(),
    EUNISd_2 = col_character(),
    EUNISd_3 = col_character(),
    EUNISd_4 = col_character(),
    EUNISa_1_descr = col_character(),
    EUNISb_1_descr = col_character(),
    EUNISc_1_descr = col_character(),
    EUNISd_1_descr = col_character(),
    EUNIS_assignation = col_character(),
    EUNISa_2_descr = col_character(),
    EUNISa_3_descr = col_character(),
    EUNISa_4_descr = col_character(),
    EUNISb_2_descr = col_character(),
    EUNISb_3_descr = col_character(),
    EUNISb_4_descr = col_character(),
    EUNISc_2_descr = col_character(),
    EUNISc_3_descr = col_character(),
    EUNISc_4_descr = col_character(),
    EUNISd_2_descr = col_character(),
    EUNISd_3_descr = col_character(),
    EUNISd_4_descr = col_character()
    )
  )

No parsing issues!

Some data managenemt

Several EUNIS level 1 assigned

Number of rows where there is more than one EUNIS 1 assigned, and they are different among them. See what to do with these later! So far I take EUNISa_1.

nrow(db_resurv_RS %>% 
       # Rows with more than one EUNIS 1 assigned
       filter(!is.na(EUNISb_1)) %>% 
       filter(EUNISa_1!=EUNISb_1 | EUNISb_1 != EUNISc_1 | EUNISa_1 != EUNISc_1))
[1] 102

See “confusions”:

db_resurv_RS %>% 
  # Rows with more than one EUNIS 1 assigned
  filter(!is.na(EUNISb_1)) %>% 
  filter(EUNISa_1!=EUNISb_1 | EUNISb_1 != EUNISc_1 | EUNISa_1 != EUNISc_1) %>%
  distinct(EUNISa_1, EUNISb_1, EUNISc_1, EUNISd_1)

Define “confusion” columns:

db_resurv_RS <- db_resurv_RS %>%
  mutate(EUNIS1_conf_type = case_when(
    EUNISa_1 == "R" & EUNISb_1 == "S" ~ "R/S",
    EUNISa_1 == "S" & EUNISb_1 == "T" ~ "S/T",
    EUNISa_1 == "R" & EUNISb_1 == "R" & EUNISc_1 == "S" ~ "R/S",
    EUNISa_1 == "R" & EUNISb_1 == "R" & EUNISc_1 == "S" & EUNISd_1 == "S" ~ "R/S",
    EUNISa_1 == "P" & EUNISb_1 == "Q" ~ "P/Q",
    TRUE ~ NA_character_),
    EUNIS1_conf = !is.na(EUNIS1_conf_type))

Tibble with selected columns

db_resurv_RS_short <- db_resurv_RS %>%
  select(PlotObservationID, Country, RS_CODE, `ReSurvey site`, `ReSurvey plot`,
         `Manipulate (y/n)`, `Type of manipulation`, Lon_updated, Lat_updated,
         `Location method`, `Location uncertainty (m)`, EUNISa_1,
         EUNISa_1_descr, EUNISa_2, EUNISa_2_descr, EUNISa_3, EUNISa_3_descr,
         EUNISa_4, EUNISa_4_descr, EUNIS1_conf, EUNIS1_conf_type,
         date, year, biogeo, unit, year_RS, Lon_RS, Lat_RS, 
         starts_with("NDVI"), starts_with("NDMI"), starts_with("NDWI"),
         starts_with("EVI"), starts_with("SAVI"), canopy_height,
         SOS_DOY, SOS_date, NDVI_at_SOS, Peak_DOY, Peak_date, NDVI_at_Peak,
         EOS_DOY, EOS_date, NDVI_at_EOS, Season_Length,
         S2_data, Landsat_data, CH_data, S2_phen_data)

TO-DO: Missing data checks

Do when all RS data is ready!

Flag when year is different between RS data and ReSurvey db

db_resurv_RS_short <- db_resurv_RS_short %>%
  mutate(year_diff = year != year_RS)
db_resurv_RS_short %>% count(year_diff)

2 with different year. RS indices would need to be calculated again for those.

Flag when coordinates are different between RS data and ReSurvey db

db_resurv_RS_short <- db_resurv_RS_short %>%
  mutate(Lon_diff = case_when(Lon_updated == Lon_RS ~ "NO",
                              # Sometimes they are only slighly different
                              abs(Lon_updated - Lon_RS) < 0.01 ~ "SMALL",
                              is.na(Lon_updated) | is.na(Lon_RS) ~ NA,
                              TRUE ~ "LARGE"),
         Lat_diff = case_when(Lat_updated == Lat_RS ~ "NO",
                              # Sometimes they are only slighly different
                              abs(Lat_updated - Lat_RS) < 0.01 ~ "SMALL",
                              is.na(Lat_updated) | is.na(Lat_RS) ~ NA,
                              TRUE ~ "LARGE"))
db_resurv_RS_short %>% count(Lon_diff)
db_resurv_RS_short %>% count(Lat_diff)

Very few with large differences (2 for longitude, 6 for latitude). RS indices would need to be calculated again for those.

If year_diff is TRUE or Lon_diff is LARGE or Lat_diff is LARGE –> RS indices need to be recalculated. So far, remove those rows from db_resurv_RS_short.

db_resurv_RS_short <- db_resurv_RS_short %>%
  filter(year_diff == FALSE | is.na(year_diff)) %>%
  filter(Lon_diff == "NO" |Lon_diff == "SMALL" | is.na(Lon_diff)) %>%
  filter(Lat_diff == "NO" |Lat_diff == "SMALL" | is.na(Lat_diff))

TO-DO: Recalculate RS indices for those above

Handle plots that have more than one obs per year

Add column PLOT to data to identify unique plots:

db_resurv_RS_short_PLOT <- db_resurv_RS_short %>%
  # Original names give problems, create new vars
  mutate(RS_site = `ReSurvey site`, RS_plot = `ReSurvey plot`) %>%
  # Convert to data.table for faster processing
  lazy_dt() %>%
  # Group by the 3 vars that uniquely identify each pl ot
  group_by(RS_CODE, RS_site, RS_plot) %>%
  # Create a new variable PLOT for each group
  mutate(PLOT = .GRP) %>%
  # Convert back to tibble
  as_tibble() %>%
  # Remove unneeded vars
  select(-RS_site, -RS_plot)

There should be only one observation of each plot per year.

Plots where there is at least a year with more than one observation, and where those observations have a different EUNIS assigned:

plots_to_remove <- db_resurv_RS_short_PLOT %>%
  group_by(PLOT, year) %>%
  summarize(EUNISa_1_n = n_distinct(EUNISa_1, na.rm = TRUE)) %>%
  ungroup() %>% 
  filter(EUNISa_1_n > 1) %>%
  distinct(PLOT)
`summarise()` has grouped output by 'PLOT'. You can override using the `.groups` argument.

Remove plots_to_remove from the database:

db_resurv_RS_short_PLOT <- db_resurv_RS_short_PLOT %>%
  anti_join(plots_to_remove, by = "PLOT")

Plots and years where there is more than one observation:

plots_to_merge <- db_resurv_RS_short_PLOT %>%
  group_by(PLOT, year) %>%
  # Plots that have more than one observation per year
  filter(n() > 1) %>%
  ungroup() %>%
  distinct(PLOT)

Summarize plots_to_merge:

plots_to_merge_summ <- db_resurv_RS_short_PLOT %>%
  group_by(PLOT, year) %>%
  # Plots that have more than one observation per year
  filter(n() > 1) %>%
  mutate(obs_num = row_number()) %>%
  pivot_wider(
    names_from = obs_num,
    values_from = c(date, PlotObservationID),
    names_prefix = "obs_"
  ) %>%
  arrange(PLOT) %>%
  summarize(
    across(c(Country, RS_CODE, `ReSurvey site`, `ReSurvey plot`,
             `Manipulate (y/n)`, `Type of manipulation`, Lon_updated,
             Lat_updated, `Location method`, `Location uncertainty (m)`,
             EUNISa_1, EUNISa_1_descr, EUNISa_2, EUNISa_2_descr, EUNISa_3,
             EUNISa_3_descr, EUNISa_4, EUNISa_4_descr, EUNIS1_conf,
             EUNIS1_conf_type, biogeo, unit, year_RS, Lon_RS, Lat_RS, NDVI_max,
             NDVI_median, NDVI_min, NDVI_mode, NDVI_p10, NDVI_p90, NDMI_max,
             NDMI_median, NDMI_min, NDMI_mode, NDMI_p10, NDMI_p90, NDWI_max,
             NDWI_median, NDWI_min, NDWI_mode, NDWI_p10, NDWI_p90, EVI_max,
             EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, SAVI_max,
             SAVI_median, SAVI_min, SAVI_mode, SAVI_p10, SAVI_p90,
             canopy_height, SOS_DOY, SOS_date, Peak_DOY, Peak_date, EOS_DOY,
             EOS_date, S2_data, Landsat_data, CH_data, S2_phen_data, year_diff,
             Lon_diff, Lat_diff), first),
    across(starts_with("date_obs_"), min),
    across(starts_with("PlotObservationID_obs_"), min)
    ) %>%
  ungroup()
`summarise()` has grouped output by 'PLOT'. You can override using the `.groups` argument.

Remove plots_to_merge from the database:

db_resurv_RS_short_PLOT <- db_resurv_RS_short_PLOT %>%
  anti_join(plots_to_merge, by = "PLOT")

And add plots_to_merge_summ, where each plot and year only has one row:

db_resurv_RS_short_PLOT <- bind_rows(db_resurv_RS_short_PLOT,
                                     plots_to_merge_summ)

Check that there is only one row per plot and per year:

db_resurv_RS_short_PLOT %>%
  group_by(PLOT, year) %>%
  # Plots that have more than one observation per year
  filter(n() > 1) 

So, to sum up what I have done:

  • Plots where there is at least a year with more than one observation, and where those observations have a different EUNIS assigned: Plots REMOVED from the data
  • Plots where there is more than one observation, but observations have the same EUNIS assigned: kept in the data. Merged so that there is only one row per year. Info about the different dates (when different) is kept in columns date_obs_1 - date_obs_40, and info about the different PlotObservationID is kept in the columns PlotObservationID_obs_1 - PlotObservationID_obs_40.

Save to clean data

Save clean file for analyses (to be updated continuously due to updates in ReSurvey database and updates on RS data).

write_tsv(db_resurv_RS_short_PLOT,
          here("data", "clean","db_resurv_RS_short_PLOT_20250505.csv"))

Distributions all bioregions

# Define a function to create histograms
plot_histogram <- function(data, x_var, x_label) {
  ggplot(data %>%
           filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
         aes(x = !!sym(x_var))) +
    geom_histogram(color = "black", fill = "white") +
    labs(x = x_label, y = "Frequency") +
    theme_bw()
}
# Define a function to create plots with violin + boxplot + points
distr_plot <- function(data, y_vars, y_labels) {
  for (i in seq_along(y_vars)) {
    y_var <- y_vars[[i]]
    y_label <- y_labels[[i]]
    
    p <- ggplot(data = data %>%
                  filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
                aes(x = EUNISa_1_descr, y = !!sym(y_var), fill = EUNISa_1_descr)) +
      geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8) +
      geom_point(aes(y = !!sym(y_var), color = EUNISa_1_descr),
                 position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
      geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
      stat_summary(fun.y = mean, geom = "point", shape = 20, size = 1) +
      stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                     label = length(x)),
                   geom = "text", aes(label = ..label..), vjust = 0.5) +
      labs(y = y_label, x = "EUNIS level 1") +
      scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
      guides(fill = FALSE, color = FALSE) +
      theme_bw() + coord_flip()
    
    print(p)
  }
}

HERE: Metrics calculated only for July, not for all year!

NDVI, NDMI, NDWI, SAVI and EVI

Histograms to check that values are ok:

hist_NDVI <- plot_histogram(db_resurv_RS_short_PLOT, "NDVI_max", "NDVI max")
hist_NDMI <- plot_histogram(db_resurv_RS_short_PLOT, "NDMI_max", "NDMI max")
hist_NDWI <- plot_histogram(db_resurv_RS_short_PLOT, "NDWI_max", "NDWI max")
hist_SAVI <- plot_histogram(db_resurv_RS_short_PLOT, "SAVI_max", "SAVI max")
hist_EVI <- plot_histogram(db_resurv_RS_short_PLOT, "EVI_max", "EVI max")
hist_NDVI

hist_NDMI

hist_NDWI

hist_SAVI

hist_EVI # Some values wrong!

hist_EVI_1 <- plot_histogram(db_resurv_RS_short_PLOT %>% filter(EVI_max <= 1),
                             "EVI_max", "EVI max")
hist_EVI_1

nrow(db_resurv_RS_short_PLOT %>%
       filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
       filter(EVI_max > 1))
[1] 1130
db_resurv_RS_short_PLOT %>%
       filter(EUNISa_1 %in% c("T", "R", "S", "Q"))%>%
  filter(EVI_max > 1) %>%
  count(biogeo, unit)

So far, remove observations with EVI_max > 1 from stuff using EVI values.

Distribution plots:

plot_distr_NDVI <- distr_plot(db_resurv_RS_short_PLOT, "NDVI_max", "NDVI max")
plot_distr_NDMI <- distr_plot(db_resurv_RS_short_PLOT, "NDMI_max", "NDMI max")
plot_distr_NDWI <- distr_plot(db_resurv_RS_short_PLOT, "NDWI_max", "NDWI max")
plot_distr_SAVI <- distr_plot(db_resurv_RS_short_PLOT, "SAVI_max", "SAVI max")
plot_distr_EVI <- distr_plot(db_resurv_RS_short_PLOT %>% filter(EVI_max <= 1),
                             "EVI_max", "EVI max")
plot_distr_NDVI

plot_distr_NDMI

plot_distr_NDWI

plot_distr_SAVI

plot_distr_EVI

CH

plot_distr_CH <- distr_plot(db_resurv_RS_short_PLOT, "canopy_height",
                            "Canopy height (m)")
plot_distr_CH

Show habitats with CH categories

ggplot(db_resurv_RS_short_PLOT %>%
         # Keep only forests, grasslands, shrublands and wetlands
         filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
         mutate(CH_cat =
                  factor(
                    case_when(canopy_height == 0 ~ "0 m",
                              canopy_height > 0 & canopy_height <= 1 ~ "0-1 m",
                              canopy_height > 1 & canopy_height <=2 ~ "1-2 m",
                              canopy_height > 2 & canopy_height <=5 ~ "2-5 m",
                              canopy_height > 5 & canopy_height <=8 ~ "5-8 m",
                              canopy_height > 8 ~ "> 8 m",
                              is.na(canopy_height) ~ NA_character_),
                    levels = c(
                      "0 m", "0-1 m", "1-2 m", "2-5 m", "5-8 m", "> 8 m"))),
       aes(x = EUNISa_1_descr, fill = CH_cat)) +
  geom_bar() + theme_bw() + coord_flip() +
  scale_y_continuous(labels = label_number()) +
  scale_fill_viridis_d(direction = -1) +
  labs(x = "EUNIS level 1", fill = "Canopy height") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(legend.position = c(0.8, 0.75),
        legend.direction = "vertical")

Stats per habitat type

db_resurv_RS_short_PLOT %>%
  # Keep only forests, grasslands, shrublands and wetlands
  filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  group_by(EUNISa_1_descr) %>%
  summarise(across(canopy_height, list(
    mean = mean,
    median = median,
    sd = sd,
    min = min,
    max = max
    ), na.rm = TRUE))

Phenology

Calculate metrics

db_resurv_RS_short_PLOT <- db_resurv_RS_short_PLOT %>%
  mutate(
    # Difference NDVI between Peak and SOS
    diff_Peak_SOS = NDVI_at_Peak - NDVI_at_SOS,
    # Difference NDVI between Peak and EOS
    diff_Peak_EOS = NDVI_at_Peak - NDVI_at_EOS)

Histograms phenology measures

ggplot(data = db_resurv_RS_short_PLOT %>%
         # Keep only forests, grasslands, shrublands and wetlands
         filter(EUNISa_1 %in% c("T", "R", "S", "Q") & S2_phen_data == T) %>%
         pivot_longer(cols = c(SOS_DOY, Peak_DOY, EOS_DOY), names_to = "name",
                      values_to = "value"),
       aes(x = value)) +
  geom_histogram(fill = "white", color = "black") +
  facet_grid(biogeo ~ name, scales = "free_y") +
  theme_bw()

ggplot(data = db_resurv_RS_short_PLOT %>%
         # Keep only forests, grasslands, shrublands and wetlands
         filter(EUNISa_1 %in% c("T", "R", "S", "Q") & S2_phen_data == T) %>%
         pivot_longer(cols = c(NDVI_at_SOS, NDVI_at_Peak, NDVI_at_EOS),
                      names_to = "name", values_to = "value"),
       aes(x = value)) +
  geom_histogram(fill = "white", color = "black") +
  facet_grid(biogeo ~ name, scales = "free_y") +
  theme_bw()

ggplot(data = db_resurv_RS_short_PLOT %>%
         # Keep only forests, grasslands, shrublands and wetlands
         filter(EUNISa_1 %in% c("T", "R", "S", "Q") & S2_phen_data == T) %>%
         pivot_longer(cols = c(diff_Peak_SOS, diff_Peak_EOS),
                      names_to = "name", values_to = "value"),
       aes(x = value)) +
  geom_histogram(fill = "white", color = "black") +
  facet_grid(biogeo ~ name, scales = "free_y") +
  theme_bw()

ggplot(data = db_resurv_RS_short_PLOT %>%
         # Keep only forests, grasslands, shrublands and wetlands
         filter(EUNISa_1 %in% c("T", "R", "S", "Q") & S2_phen_data == T) %>%
         pivot_longer(cols = c(Season_Length),
                      names_to = "name", values_to = "value"),
       aes(x = value)) +
  geom_histogram(fill = "white", color = "black") +
  facet_grid(biogeo ~ name, scales = "free_y") +
  theme_bw()

Distributions

plot_distr_SOS_DOY <- distr_plot(db_resurv_RS_short_PLOT, "SOS_DOY",
                                 "SOS DOY")
plot_distr_Peak_DOY <- distr_plot(db_resurv_RS_short_PLOT, "Peak_DOY",
                                  "Peak DOY")
plot_distr_EOS_DOY <- distr_plot(db_resurv_RS_short_PLOT, "EOS_DOY",
                                 "EOS DOY")
plot_distr_NDVI_at_SOS <- distr_plot(db_resurv_RS_short_PLOT, "NDVI_at_SOS",
                                     "NDVI at SOS")
plot_distr_NDVI_at_Peak <- distr_plot(db_resurv_RS_short_PLOT, "NDVI_at_Peak",
                                      "NDVI at Peak")
plot_distr_NDVI_at_EOS <- distr_plot(db_resurv_RS_short_PLOT, "NDVI_at_EOS",
                                     "NDVI at EOS")
plot_distr_diff_Peak_SOS <- distr_plot(db_resurv_RS_short_PLOT, "diff_Peak_SOS",
                                       "Difference Peak-SOS")
plot_distr_diff_Peak_EOS <- distr_plot(db_resurv_RS_short_PLOT, "diff_Peak_EOS",
                                       "Difference Peak-EOS")
plot_distr_Season_Length <- distr_plot(db_resurv_RS_short_PLOT, "Season_Length",
                                       "Season Length")
plot_distr_SOS_DOY

plot_distr_Peak_DOY

plot_distr_EOS_DOY

plot_distr_NDVI_at_SOS

plot_distr_NDVI_at_Peak

plot_distr_NDVI_at_EOS

plot_distr_diff_Peak_SOS

plot_distr_diff_Peak_EOS

plot_distr_Season_Length

Distributions per bioregion

# Define a function to create plots with violin + boxplot + points
distr_plot_biogeo <- function(data, y_var, y_label) {
  ggplot(data = data %>%
           filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
         aes(x = EUNISa_1_descr, y = !!sym(y_var), fill = EUNISa_1_descr)) +
    geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8) +
    geom_point(aes(y = !!sym(y_var), color = EUNISa_1_descr),
               position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
    geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
    stat_summary(fun.y = mean, geom = "point", shape = 20, size = 1) +
    stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                   label = length(x)),
                 geom = "text", aes(label = ..label..), vjust = 0.5) +
    labs(y = y_label, x = "EUNISa_1_descr") +
    scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
    guides(fill = FALSE, color = FALSE) +
    theme_bw() + coord_flip() + facet_wrap(~ biogeo)
}

Max. NDVI, NDMI, NDWI, SAVI and EVI

Distribution plots:

plot_distr_NDVI_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                            "NDVI_max", "NDVI max")
plot_distr_NDMI_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                            "NDMI_max", "NDMI max")
plot_distr_NDWI_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                            "NDWI_max", "NDWI max")
plot_distr_SAVI_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                            "SAVI_max", "SAVI max")
plot_distr_EVI_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)) %>%
                                             filter(EVI_max <= 1),
                                           "EVI_max", "EVI max")
plot_distr_NDVI_biogeo

plot_distr_NDMI_biogeo

plot_distr_NDWI_biogeo

plot_distr_SAVI_biogeo

plot_distr_EVI_biogeo

BOR missing because there is no EUNIS info for those that have RS info.

CH

plot_distr_CH_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT,
                                          "canopy_height", "Canopy height (m)")
plot_distr_CH_biogeo

In this plot, those with biogeo = NA are those that do not have S2 or Landsat data (and thus biogeo has not been assigned), but have CH data. We should later assign a biogeo based on location.

Phenology

plot_distr_SOS_DOY_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                               "SOS_DOY", "SOS DOY")
plot_distr_Peak_DOY_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                                "Peak_DOY", "Peak DOY")
plot_distr_EOS_DOY_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                               "EOS_DOY", "EOS DOY")
plot_distr_NDVI_at_SOS_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                               "NDVI_at_SOS", "NDVI at SOS")
plot_distr_NDVI_at_Peak_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                                "NDVI_at_Peak", "NDVI at Peak")
plot_distr_NDVI_at_EOS_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                               "NDVI_at_EOS", "NDVI at EOS")
plot_distr_diff_Peak_SOS_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                               "diff_Peak_SOS", "Difference Peak-SOS")
plot_distr_diff_Peak_EOS_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                               "diff_Peak_EOS", "Difference Peak-EOS")
plot_distr_Season_Length_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                               "Season_Length", "Season Length")

plot_distr_SOS_DOY_biogeo

plot_distr_Peak_DOY_biogeo

plot_distr_EOS_DOY_biogeo

plot_distr_NDVI_at_SOS_biogeo

plot_distr_NDVI_at_Peak_biogeo

plot_distr_NDVI_at_EOS_biogeo

plot_distr_diff_Peak_SOS_biogeo

plot_distr_diff_Peak_EOS_biogeo

plot_distr_Season_Length_biogeo

BOR missing because there is no EUNIS info for those that have RS info.

HERE: Verify SOS-Peak_EOS ODY

ERRORS! Double-check and send to Bea:

db_resurv_RS_short_PLOT %>% filter(SOS_DOY > Peak_DOY)
db_resurv_RS_short_PLOT %>% filter(Peak_DOY > EOS_DOY)
db_resurv_RS_short_PLOT %>% filter(SOS_DOY > EOS_DOY)
db_resurv_RS_short_PLOT %>% filter(NDVI_at_Peak < NDVI_at_SOS)
db_resurv_RS_short_PLOT %>% filter(NDVI_at_Peak < NDVI_at_EOS)

Comparison differential GPS vs. other points

# Define function
distr_plot_GPS <- function(data, y_var, y_label) {
  ggplot(data = data %>%
           filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
           mutate(GPS = 
                    ifelse(
                      is.na(`Location method`) |
                        `Location method` != "Location with differential GPS",
                      "Other",
                      "Differential GPS")),
         aes(x = EUNISa_1_descr, y = !!sym(y_var), fill = EUNISa_1_descr)) +
    geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8) +
    geom_point(aes(y = !!sym(y_var), color = EUNISa_1_descr),
               position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
    geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
    stat_summary(fun.y = mean, geom = "point", shape = 20, size = 1) +
    stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                   label = length(x)),
                 geom = "text", aes(label = ..label..), vjust = 0.5) +
    labs(y = y_label, x = "EUNIS level 1") +
    scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
    guides(fill = FALSE, color = FALSE) +
    theme_bw() + coord_flip() + facet_wrap(~ GPS)
}
plot_distr_NDVI_GPS <- distr_plot_GPS(db_resurv_RS_short_PLOT,
                                      "NDVI_max", "NDVI max")
plot_distr_NDMI_GPS <- distr_plot_GPS(db_resurv_RS_short_PLOT,
                                      "NDMI_max", "NDMI max")
plot_distr_NDWI_GPS <- distr_plot_GPS(db_resurv_RS_short_PLOT,
                                      "NDWI_max", "NDWI max")
plot_distr_SAVI_GPS <- distr_plot_GPS(db_resurv_RS_short_PLOT,
                                      "SAVI_max", "SAVI max")
plot_distr_EVI_GPS <- distr_plot_GPS(db_resurv_RS_short_PLOT %>%
                                       filter(EVI_max <= 1),
                                      "EVI_max", "EVI max")
plot_distr_NDVI_GPS

plot_distr_NDMI_GPS

plot_distr_NDWI_GPS

plot_distr_SAVI_GPS

plot_distr_EVI_GPS

Including PLOT (USE LATER?)

Summarize variables by plot:

db_resurv_RS_short_PLOT_summ <- db_resurv_RS_short_PLOT %>%
  filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  filter(S2_data == T | Landsat_data == T) %>%
  group_by(PLOT) %>%
  summarize(EUNIS1 = if_else(n_distinct(EUNISa_1) > 1, "Change",
                             unique(EUNISa_1)[1]),
            count = n(),
            across(starts_with("NDVI"), list(mean = mean, sd = sd),
                   .names = "{col}_{fn}"))

Maybe use later because now many plots have only one observation, probably because some Landsat data is missing?

First validation

For T, R, S, Q habitats.

Define a set of rules for a first validation of ALL ReSurvey data. We can call these “Expert-based” rules.

Number of observations in ReSurvey from the habitats of interest:

nrow(db_resurv_RS_short_PLOT %>%
       filter(EUNISa_1 %in% c("T", "R", "S", "Q")))
[1] 181061

Number of observations in ReSurvey from the habitats of interest and with all RS data:

nrow(db_resurv_RS_short_PLOT %>%
       filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
       filter(CH_data == T) %>%
       filter(S2_data == T | Landsat_data ==T) %>%
       filter(S2_phen_data == T))
[1] 12219
db_resurv_RS_short_PLOT_terrestrial <- db_resurv_RS_short_PLOT %>%
  filter(EUNISa_1 %in% c("T", "R", "S", "Q"))

Define rules

Create column for first validation based on different indicators, where “wrong” is noted when the validation rule is not met. Include EUNIS1 confusions.

db_resurv_RS_short_PLOT_terrestrial %>% count(EUNISa_1, EUNIS1_conf_type)

Define rules:

db_resurv_RS_short_PLOT_terrestrial <-
  db_resurv_RS_short_PLOT_terrestrial %>%
  mutate(
    valid_1_NDWI = case_when(
      # Points that are basically water
      NDWI_max > 0.3 ~ "wrong",
      TRUE ~ NA_character_),
    valid_1_CH = case_when(
      # T points with low CH
      EUNISa_1 == "T" & canopy_height < 8 ~ "wrong",
      # S points with low CH
      EUNISa_1 =="S" & canopy_height < 5 ~ "wrong",
      # R & Q points with high CH
      EUNISa_1 %in% c("R", "Q") & canopy_height > 2 ~ "wrong",
      TRUE ~ NA_character_),
    valid_1_NDVI = case_when(
      # T points with low NDVI_max
      EUNISa_1 == "T" & NDVI_max < 0.6 ~ "wrong",
      # S-R-Q points with low NDVI_max
      EUNISa_1 %in% c("R", "S", "Q") & NDVI_max < 0.2 ~ "wrong",
      TRUE ~ NA_character_),
    # Count how many validation rules are not met
    valid_1_count = rowSums(across(c(valid_1_NDWI, valid_1_CH, valid_1_NDVI), 
                             ~ . == "wrong"), na.rm = TRUE),
    # Points where at least 1 rule not met
    valid_1 = if_else(valid_1_count > 0, "At least 1 rule broken",
                      "No rules broken so far")
    )

Plots first validation

ggplot(db_resurv_RS_short_PLOT_terrestrial%>%
         mutate(rules_broken = case_when(
           valid_1_count == 1 & valid_1_NDWI == "wrong" ~ "NDWI",
           valid_1_count == 1 & valid_1_NDVI == "wrong" ~ "NDVI",
           valid_1_count == 1 & valid_1_CH == "wrong" ~ "CH",
           valid_1_count == 2 &
             valid_1_NDWI == "wrong" & valid_1_NDVI == "wrong"~ "NDWI + NDVI",
           valid_1_count == 2 &
             valid_1_NDWI == "wrong" & valid_1_CH == "wrong"~ "NDWI + CH",
           valid_1_count == 2 &
             valid_1_NDVI == "wrong" & valid_1_CH == "wrong"~ "NDVI + CH",
           valid_1_count == 3 ~ "NDWI + NDVI + CH",
           TRUE ~ NA_character_
         )), 
       aes(x = valid_1_count, fill = rules_broken)) +
  geom_bar() + labs(x = "Number of broken rules")

db_resurv_RS_short_PLOT_terrestrial %>%
         mutate(rules_broken = case_when(
           valid_1_count == 1 & valid_1_NDWI == "wrong" ~ "NDWI",
           valid_1_count == 1 & valid_1_NDVI == "wrong" ~ "NDVI",
           valid_1_count == 1 & valid_1_CH == "wrong" ~ "CH",
           valid_1_count == 2 &
             valid_1_NDWI == "wrong" & valid_1_NDVI == "wrong"~ "NDWI + NDVI",
           valid_1_count == 2 &
             valid_1_NDWI == "wrong" & valid_1_CH == "wrong"~ "NDWI + CH",
           valid_1_count == 2 &
             valid_1_NDVI == "wrong" & valid_1_CH == "wrong"~ "NDVI + CH",
           valid_1_count == 3 ~ "NDWI + NDVI + CH",
           TRUE ~ NA_character_
         )) %>%
  count(rules_broken, EUNIS1_conf_type)

Proportion of observations not validated (so far):

nrow(db_resurv_RS_short_PLOT_terrestrial %>% filter(valid_1_count > 0))/
  nrow(db_resurv_RS_short_PLOT_terrestrial)
[1] 0.1941721

But be aware that there are still many missing RS data, e.g. Landsat data for bioregion CON (many points).

ggplot(db_resurv_RS_short_PLOT_terrestrial %>%
         mutate(diff_GPS = if_else(
           `Location method` != "Location with differential GPS" |
             is.na(`Location method`), "no", "yes")), 
       aes(x = diff_GPS, fill = valid_1)) +
  geom_bar() + labs(x = "Differential GPS")

ggplot(db_resurv_RS_short_PLOT_terrestrial %>%
         mutate(GPS = case_when(
           `Location method` == "Location with differential GPS" ~ "yes",
           `Location method` == "Location with GPS" ~ "yes",
           is.na(`Location method`) ~ "no",
           TRUE ~ "no"
         )), 
       aes(x = GPS, fill = valid_1)) +
  geom_bar() + labs(x = "GPS")

Points with any rule broken and confusion between EUNIS:

nrow(db_resurv_RS_short_PLOT_terrestrial %>%
       filter(EUNIS1_conf == T & valid_1_count > 0))
[1] 9

Convert to shp to look at these in GIS:

# st_write(db_resurv_RS_short_PLOT_terrestrial %>%
#            filter(EUNIS1_conf == T & valid_1_count > 0) %>%
#            st_as_sf(coords = c("Lon_updated", "Lat_updated"), crs = 4326),
#          "C:/GIS/MOTIVATE/shapefiles/resurv_not_val_EUNIS_conf.shp")

Checked and yes

How many points with differential GPS that have at least 1 rule broken?

nrow(db_resurv_RS_short_PLOT_terrestrial %>%
  filter(`Location method` == "Location with differential GPS" &
           valid_1 == "At least 1 rule broken"))
[1] 1202

Convert to shp to look at these in GIS:

# st_write(db_resurv_RS_short_PLOT_terrestrial %>%
#            filter(`Location method` == "Location with differential GPS" &
#                     valid_1 == "At least 1 rule broken") %>%
#            st_as_sf(coords = c("Lon_updated", "Lat_updated"), crs = 4326),
#          "C:/GIS/MOTIVATE/shapefiles/resurv_not_val_diff_GPS.shp")

Distributions from diff GPS points without rules broken so far

Create tibble with differential GPS points without rules broken so far:

diff_GPS_valid <- db_resurv_RS_short_PLOT_terrestrial %>%
  filter(`Location method` == "Location with differential GPS" &
           valid_1 == "No rules broken so far") 

Max. and min. NDVI, NDMI, NDWI, SAVI and EVI

plot_distr_NDVI_max_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "NDVI_max", "NDVI max")
plot_distr_NDMI_max_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "NDMI_max", "NDMI max")
plot_distr_NDWI_max_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "NDWI_max", "NDWI max")
plot_distr_SAVI_max_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "SAVI_max", "SAVI max")
plot_distr_EVI_max_diff_GPS_valid <- distr_plot(diff_GPS_valid %>%
                                              filter(EVI_max <= 1),
                                            "EVI_max", "EVI max")
plot_distr_NDVI_max_diff_GPS_valid

plot_distr_NDMI_max_diff_GPS_valid

plot_distr_NDWI_max_diff_GPS_valid

plot_distr_SAVI_max_diff_GPS_valid

plot_distr_EVI_max_diff_GPS_valid

plot_distr_NDVI_min_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "NDVI_min", "NDVI min")
plot_distr_NDMI_min_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "NDMI_min", "NDMI min")
plot_distr_NDWI_min_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "NDWI_min", "NDWI min")
plot_distr_SAVI_min_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "SAVI_min", "SAVI min")
plot_distr_EVI_min_diff_GPS_valid <- distr_plot(diff_GPS_valid %>%
                                                  # Some values wrong!
                                                  # EVI should not be lower than -1
                                                  filter(EVI_min >= -1),
                                                "EVI_min", "EVI min")
plot_distr_NDVI_min_diff_GPS_valid

plot_distr_NDMI_min_diff_GPS_valid

plot_distr_NDWI_min_diff_GPS_valid

plot_distr_SAVI_min_diff_GPS_valid

plot_distr_EVI_min_diff_GPS_valid

CH

plot_distr_CH_diff_GPS_valid <- distr_plot(diff_GPS_valid, "canopy_height",
                                           "Canopy height (m)")
plot_distr_CH_diff_GPS_valid

Phenology

plot_distr_SOS_DOY_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                                "SOS_DOY", "SOS DOY")
plot_distr_Peak_DOY_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                                 "Peak_DOY", "Peak DOY")
plot_distr_EOS_DOY_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                                "EOS_DOY", "EOS DOY")
plot_distr_SOS_DOY_diff_GPS_valid

plot_distr_Peak_DOY_diff_GPS_valid

plot_distr_EOS_DOY_diff_GPS_valid

Not very conclusive…

Distributions from GPS (diff or not) points without rules broken so far

Create tibble with differential GPS points without rules broken so far:

all_GPS_valid <- db_resurv_RS_short_PLOT_terrestrial %>%
  filter((`Location method` == "Location with differential GPS" | 
            `Location method` == "Location with GPS" ) &
           valid_1 == "No rules broken so far") 

Max. and min. NDVI, NDMI, NDWI, SAVI and EVI

plot_distr_NDVI_max_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "NDVI_max", "NDVI max")
plot_distr_NDMI_max_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "NDMI_max", "NDMI max")
plot_distr_NDWI_max_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "NDWI_max", "NDWI max")
plot_distr_SAVI_max_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "SAVI_max", "SAVI max")
plot_distr_EVI_max_all_GPS_valid <- distr_plot(all_GPS_valid %>%
                                              filter(EVI_max <= 1),
                                            "EVI_max", "EVI max")
plot_distr_NDVI_max_all_GPS_valid

plot_distr_NDMI_max_all_GPS_valid

plot_distr_NDWI_max_all_GPS_valid

plot_distr_SAVI_max_all_GPS_valid

plot_distr_EVI_max_all_GPS_valid

plot_distr_NDVI_min_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "NDVI_min", "NDVI min")
plot_distr_NDMI_min_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "NDMI_min", "NDMI min")
plot_distr_NDWI_min_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "NDWI_min", "NDWI min")
plot_distr_SAVI_min_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "SAVI_min", "SAVI min")
plot_distr_EVI_min_all_GPS_valid <- distr_plot(all_GPS_valid %>%
                                                  # Some values wrong!
                                                  # EVI should not be lower than -1
                                                  filter(EVI_min >= -1),
                                                "EVI_min", "EVI min")
plot_distr_NDVI_min_all_GPS_valid

plot_distr_NDMI_min_all_GPS_valid

plot_distr_NDWI_min_all_GPS_valid

plot_distr_SAVI_min_all_GPS_valid

plot_distr_EVI_min_all_GPS_valid

CH

plot_distr_CH_all_GPS_valid <- distr_plot(all_GPS_valid, "canopy_height",
                                           "Canopy height (m)")
plot_distr_CH_all_GPS_valid

Phenology

plot_distr_SOS_DOY_all_GPS_valid <- distr_plot(all_GPS_valid,
                                                "SOS_DOY", "SOS DOY")
plot_distr_Peak_DOY_all_GPS_valid <- distr_plot(all_GPS_valid,
                                                 "Peak_DOY", "Peak DOY")
plot_distr_EOS_DOY_all_GPS_valid <- distr_plot(all_GPS_valid,
                                                "EOS_DOY", "EOS DOY")
plot_distr_SOS_DOY_all_GPS_valid

plot_distr_Peak_DOY_all_GPS_valid

plot_distr_EOS_DOY_all_GPS_valid

Combined plot

grid.arrange(
  plot_distr_NDVI_max_diff_GPS_valid, plot_distr_NDVI_max_all_GPS_valid,
  plot_distr_NDMI_min_diff_GPS_valid, plot_distr_NDMI_min_all_GPS_valid,
  ncol = 2)

Diff GPS valid points above p20 of NDVI_max and NDMI_min for each habitat

percentiles_diff_GPS <- diff_GPS_valid %>%
  group_by(EUNISa_1) %>%
  summarize(percentile_20_NDVI_max = quantile(NDVI_max, probs = 0.20, na.rm = T),
            percentile_20_NDMI_min = quantile(NDMI_min, probs = 0.20, na.rm = T))

diff_GPS_valid <- diff_GPS_valid %>%
  left_join(percentiles_diff_GPS, by = "EUNISa_1") %>%
  mutate(category_NDVI_max = case_when(
    NDVI_max < percentile_20_NDVI_max ~ "below_20th",
    NDVI_max >= percentile_20_NDVI_max ~ "above_20th"),
  category_NDMI_min = case_when(
    NDMI_min < percentile_20_NDMI_min ~ "below_20th",
    NDMI_min >= percentile_20_NDMI_min ~ "above_20th"))

ggplot(data = diff_GPS_valid,
       aes(x = EUNISa_1_descr, y = NDVI_max)) +
  geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8,
                   fill = "lightblue") +
  geom_point(aes(color = category_NDVI_max),
             position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 1) +
  stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1, label = length(x)),
               geom = "text", aes(label = ..label..), vjust = 0.5) +
  labs(y = "NDVI max", x = "EUNIS level 1") +
  guides(fill = FALSE, color = FALSE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_color_manual(values = c("below_20th" = "grey", "above_20th" = "lightblue")) +
  theme_bw() + coord_flip()


ggplot(data = diff_GPS_valid,
       aes(x = EUNISa_1_descr, y = NDMI_min)) +
  geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8,
                   fill = "lightblue") +
  geom_point(aes(color = category_NDMI_min),
             position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 1) +
  stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1, label = length(x)),
               geom = "text", aes(label = ..label..), vjust = 0.5) +
  labs(y = "NDMI min", x = "EUNIS level 1") +
  guides(fill = FALSE, color = FALSE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_color_manual(values = c("below_20th" = "grey", "above_20th" = "lightblue")) +
  theme_bw() + coord_flip()

GPS (diff or not) valid points above p20 of NDVI_max and NDMI_min for each habitat

percentiles_all_GPS <- all_GPS_valid %>%
  group_by(EUNISa_1) %>%
  summarize(percentile_20_NDVI_max = quantile(NDVI_max, probs = 0.20, na.rm = T),
            percentile_20_NDMI_min = quantile(NDMI_min, probs = 0.20, na.rm = T))

all_GPS_valid <- all_GPS_valid %>%
  left_join(percentiles_all_GPS, by = "EUNISa_1") %>%
  mutate(category_NDVI_max = case_when(
    NDVI_max < percentile_20_NDVI_max ~ "below_20th",
    NDVI_max >= percentile_20_NDVI_max ~ "above_20th"),
  category_NDMI_min = case_when(
    NDMI_min < percentile_20_NDMI_min ~ "below_20th",
    NDMI_min >= percentile_20_NDMI_min ~ "above_20th"))

ggplot(data = all_GPS_valid,
       aes(x = EUNISa_1_descr, y = NDVI_max)) +
  geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8,
                   fill = "lightblue") +
  geom_point(aes(color = category_NDVI_max),
             position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 1) +
  stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1, label = length(x)),
               geom = "text", aes(label = ..label..), vjust = 0.5) +
  labs(y = "NDVI max", x = "EUNIS level 1") +
  guides(fill = FALSE, color = FALSE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_color_manual(values = c("below_20th" = "grey", "above_20th" = "lightblue")) +
  theme_bw() + coord_flip()


ggplot(data = all_GPS_valid,
       aes(x = EUNISa_1_descr, y = NDMI_min)) +
  geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8,
                   fill = "lightblue") +
  geom_point(aes(color = category_NDMI_min),
             position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 1) +
  stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1, label = length(x)),
               geom = "text", aes(label = ..label..), vjust = 0.5) +
  labs(y = "NDMI min", x = "EUNIS level 1") +
  guides(fill = FALSE, color = FALSE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_color_manual(values = c("below_20th" = "grey", "above_20th" = "lightblue")) +
  theme_bw() + coord_flip()

RF models

Using the conditional inference version of random forest (cforest in package party). Suggested if the data are highly correlated. Cforest is more stable in deriving variable importance values in the presence of highly correlated variables, thus providing better accuracy in calculating variable importance (ref below).

Hothorn, T., Hornik, K. and Zeileis, A. (2006) Unbiased Recursive Portioning: A Conditional Inference Framework. Journal of Computational and Graphical Statistics, 15, 651- 674. http://dx.doi.org/10.1198/106186006X133933

All GPS points above p20

Filter the data to get only GPS-points above p20 of NDVI_max and NDMI_min.

all_GPS_valid <- all_GPS_valid %>%
  select(-percentile_20_NDVI_max, -percentile_20_NDMI_min)
percentiles <- all_GPS_valid %>%
  group_by(EUNISa_1) %>%
  summarize(
    percentile_20_NDVI_max = quantile(NDVI_max, 0.20, na.rm = T),
    percentile_20_NDMI_min = quantile(NDMI_min, 0.20, na.rm = T),
    percentile_80_NDVI_max = quantile(NDVI_max, 0.80, na.rm = T),
    percentile_80_NDMI_min = quantile(NDMI_min, 0.80, na.rm = T)
    )

# Join the percentiles back to the original data
all_GPS_valid <- all_GPS_valid %>%
  left_join(percentiles, by = "EUNISa_1")

# Filter rows above the 20th percentile for both variables for each category of EUNISa_1
filtered_data1 <- all_GPS_valid %>%
  filter(
    NDVI_max >= percentile_20_NDVI_max & NDMI_min >= percentile_20_NDMI_min
    ) %>%
  filter(!is.na(NDVI_max) & !is.na(NDMI_max) & !is.na(NDWI_max) &
           !is.na(SAVI_max) & !is.na(EVI_max) & !is.na(NDVI_min) &
           !is.na(NDMI_min) & !is.na(NDWI_min) & !is.na(SAVI_min) &
           !is.na(EVI_min)) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(EVI_max <= 1 & EVI_min >= -1)

Split into training and test data sets.

train_indices1 <- sample(1:nrow(filtered_data1), 0.7 * nrow(filtered_data1))
train_data1 <- filtered_data1[train_indices1, ]
test_data1 <- filtered_data1[-train_indices1, ]

Number of points per category for filtered data:

filtered_data1 %>% count(EUNISa_1)

Investigate package ggparty (e.g. autoplot function, and more).

TO-DO: Choose the hyperparameter mtry based on the square root of the number of predictor variables (Hastie et al., 2009)-

Hastie, T., Tibshirani, R., & Friedman, J. (2009). The elements of statistical learning: Data mining, inference, and prediction. Springer Science & Business Media.

Maybe TO_DO: We variated ntree from 50 to 800 in steps of 50, leaving mtry constant at 2. Tis parameter variation showed that ntree=500 was optimal, while higher ntree led to no further model improvement (Supplementary Fig. S10). Subsequently, the hyperparameter mtry was varied from 2 to 8 with constant ntree=500. Here, mtry=3 led to the best results in almost all cases (Supplementary Fig. S11). Consequently, we chose ntree=500 and mtry=3 for our main analysis across all study sites.

rf_cforest1 <- party::cforest(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max +
                                NDMI_min + NDWI_max + NDWI_min + EVI_max +
                                EVI_min + SAVI_max + SAVI_min + canopy_height, 
                              data = train_data1,
                              controls = cforest_control(
                                mtry = 3,
                                # mtry = sqrt(11)
                                # Default mtry = 5
                                # Bagging: mtry = NULL
                                # or = number of input variables
                                ntree = 500) # Default, try increasing
                              ) 
predictions_rf_cforest1 <- predict(rf_cforest1, newdata = test_data1,
                                   OOB = TRUE, type = "response")

Confusion matrix:

confusionMatrix(predictions_rf_cforest1, test_data1$EUNISa_1)
Confusion Matrix and Statistics

          Reference
Prediction    Q    R    S    T
         Q  638  327    0    0
         R  260 1491    1    0
         S    0    0   38    4
         T    0    0   12   56

Overall Statistics
                                          
               Accuracy : 0.7863          
                 95% CI : (0.7708, 0.8013)
    No Information Rate : 0.6431          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.566           
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: Q Class: R Class: S Class: T
Sensitivity            0.7105   0.8201  0.74510  0.93333
Specificity            0.8305   0.7413  0.99856  0.99566
Pos Pred Value         0.6611   0.8510  0.90476  0.82353
Neg Pred Value         0.8604   0.6958  0.99533  0.99855
Prevalence             0.3177   0.6431  0.01804  0.02122
Detection Rate         0.2257   0.5274  0.01344  0.01981
Detection Prevalence   0.3414   0.6197  0.01486  0.02405
Balanced Accuracy      0.7705   0.7807  0.87183  0.96450

SurrogateTree –> does not work

varimp_rf_cforest1 <- party::varimp(rf_cforest1, conditional = F) 
# conditional = T adjusts for correlations between predictor variables
# Takes long!
# party::varimp(rf_cforest1, conditional = T) 
# conditional = T adjusts for correlations between predictor variables
# Takes long!

Variable Importance Plot

varimp_rf_cforest1_df <- data.frame(Variable = names(varimp_rf_cforest1),
                                    Importance = varimp_rf_cforest1)
ggplot(varimp_rf_cforest1_df,
       aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  coord_flip() + theme_minimal() +
  labs(title = "Variable Importance", x = "Variables", y = "Importance")

Tree Visualization

# Create a single conditional inference tree using ctree
single_tree1 <- ctree(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max + NDMI_min +
                       NDWI_max + NDWI_min + EVI_max + EVI_min + SAVI_max +
                       SAVI_min + canopy_height,
                     data = train_data1)

# Plot the single tree using
autoplot(single_tree1)

All GPS points within IQ range

Filter the data to get only GPS-points within IQ range of NDVI_max and NDMI_min.

IQ_ranges <- all_GPS_valid %>%
  group_by(EUNISa_1) %>%
  summarize(
    Q1_NDVI_max = quantile(NDVI_max, 0.25, na.rm = T),
    Q1_NDMI_min = quantile(NDMI_min, 0.25, na.rm = T),
    Q3_NDVI_max = quantile(NDVI_max, 0.75, na.rm = T),
    Q3_NDMI_min = quantile(NDMI_min, 0.75, na.rm = T)
    )

# Join the IQ ranges back to the original data
all_GPS_valid <- all_GPS_valid %>%
  left_join(IQ_ranges, by = "EUNISa_1")

# Filter rows within the IQR range for both variables
filtered_data2 <- all_GPS_valid %>%
  filter(
    (NDVI_max >= Q1_NDVI_max & NDVI_max <= Q3_NDVI_max) &
    (NDMI_min >= Q1_NDMI_min & NDMI_min <= Q3_NDMI_min)
    ) %>%
  filter(!is.na(NDVI_max) & !is.na(NDMI_max) & !is.na(NDWI_max) &
           !is.na(SAVI_max) & !is.na(EVI_max) & !is.na(NDVI_min) &
           !is.na(NDMI_min) & !is.na(NDWI_min) & !is.na(SAVI_min) &
           !is.na(EVI_min)) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(EVI_max <= 1 & EVI_min >= -1)

Split into training and test data sets.

train_indices2 <- sample(1:nrow(filtered_data2), 0.7 * nrow(filtered_data2))
train_data2 <- filtered_data2[train_indices2, ]
test_data2 <- filtered_data2[-train_indices2, ]

Number of points per category for filtered data:

filtered_data2 %>% count(EUNISa_1)
rf_cforest2 <- party::cforest(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max +
                                NDMI_min + NDWI_max + NDWI_min + EVI_max +
                                EVI_min + SAVI_max + SAVI_min + canopy_height, 
                              data = train_data2,
                              controls = cforest_control(
                                mtry = 3,
                                # mtry = sqrt(11)
                                # Default mtry = 5
                                # Bagging: mtry = NULL
                                # or = number of input variables
                                ntree = 500) # Default, try increasing
                              ) 
predictions_rf_cforest2 <- predict(rf_cforest2, newdata = test_data2,
                                   OOB = TRUE, type = "response")

Confusion matrix:

confusionMatrix(predictions_rf_cforest2, test_data2$EUNISa_1)
Confusion Matrix and Statistics

          Reference
Prediction   Q   R   S   T
         Q 310   5   0   0
         R  43 885   0   0
         S   0   0  14   2
         T   0   0   6  22

Overall Statistics
                                         
               Accuracy : 0.9565         
                 95% CI : (0.9439, 0.967)
    No Information Rate : 0.6915         
    P-Value [Acc > NIR] : < 2.2e-16      
                                         
                  Kappa : 0.8997         
                                         
 Mcnemar's Test P-Value : NA             

Statistics by Class:

                     Class: Q Class: R Class: S Class: T
Sensitivity            0.8782   0.9944  0.70000  0.91667
Specificity            0.9946   0.8917  0.99842  0.99525
Pos Pred Value         0.9841   0.9537  0.87500  0.78571
Neg Pred Value         0.9558   0.9861  0.99528  0.99841
Prevalence             0.2743   0.6915  0.01554  0.01865
Detection Rate         0.2409   0.6876  0.01088  0.01709
Detection Prevalence   0.2448   0.7211  0.01243  0.02176
Balanced Accuracy      0.9364   0.9430  0.84921  0.95596
varimp_rf_cforest2 <- party::varimp(rf_cforest2, conditional = F) 

Variable Importance Plot

varimp_rf_cforest2_df <- data.frame(Variable = names(varimp_rf_cforest2),
                                    Importance = varimp_rf_cforest2)
ggplot(varimp_rf_cforest2_df,
       aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  coord_flip() + theme_minimal() +
  labs(title = "Variable Importance", x = "Variables", y = "Importance")

All GPS points within mean +/- SD

Filter the data to get only GPS-points within mean +/- SD of NDVI_max and NDMI_min.

mean_sd <- all_GPS_valid %>%
  group_by(EUNISa_1) %>%
  summarize(
    mean_NDVI_max = mean(all_GPS_valid$NDVI_max, na.rm = T),
    mean_NDMI_min = mean(all_GPS_valid$NDMI_min, na.rm = T),
    sd_NDVI_max = sd(all_GPS_valid$NDVI_max, na.rm = T),
    sd_NDMI_min = sd(all_GPS_valid$NDMI_min, na.rm = T)
    )

# Join the IQ ranges back to the original data
all_GPS_valid <- all_GPS_valid %>%
  left_join(mean_sd, by = "EUNISa_1")

# Filter rows within the specified range for both variables
filtered_data3 <- all_GPS_valid %>%
  filter(
    (NDVI_max >= (mean_NDVI_max - sd_NDVI_max) & NDVI_max <= (mean_NDVI_max + sd_NDVI_max)) &
      (NDMI_min >= (mean_NDMI_min - sd_NDMI_min) & NDMI_min <= (mean_NDMI_min + sd_NDMI_min))
    ) %>%
  filter(!is.na(NDVI_max) & !is.na(NDMI_max) & !is.na(NDWI_max) &
           !is.na(SAVI_max) & !is.na(EVI_max) & !is.na(NDVI_min) &
           !is.na(NDMI_min) & !is.na(NDWI_min) & !is.na(SAVI_min) &
           !is.na(EVI_min)) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(EVI_max <= 1 & EVI_min >= -1)

Split into training and test data sets.

train_indices3 <- sample(1:nrow(filtered_data3), 0.7 * nrow(filtered_data3))
train_data3 <- filtered_data3[train_indices3, ]
test_data3 <- filtered_data3[-train_indices3, ]

Number of points per category for filtered data:

filtered_data3 %>% count(EUNISa_1)
rf_cforest3 <- party::cforest(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max +
                                NDMI_min + NDWI_max + NDWI_min + EVI_max +
                                EVI_min + SAVI_max + SAVI_min + canopy_height, 
                              data = train_data3,
                              controls = cforest_control(
                                mtry = 3,
                                # mtry = sqrt(11)
                                # Default mtry = 5
                                # Bagging: mtry = NULL
                                # or = number of input variables
                                ntree = 500) # Default, try increasing
                              ) 
predictions_rf_cforest3 <- predict(rf_cforest3, newdata = test_data3,
                                   OOB = TRUE, type = "response")

Confusion matrix:

confusionMatrix(predictions_rf_cforest3, test_data3$EUNISa_1)
Confusion Matrix and Statistics

          Reference
Prediction    Q    R    S    T
         Q  264  132    0    0
         R  511 1283    0    0
         S    0    0   36    2
         T    0    0   13   53

Overall Statistics
                                          
               Accuracy : 0.7132          
                 95% CI : (0.6942, 0.7316)
    No Information Rate : 0.6168          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.3741          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: Q Class: R Class: S Class: T
Sensitivity            0.3406   0.9067  0.73469  0.96364
Specificity            0.9131   0.4187  0.99911  0.99419
Pos Pred Value         0.6667   0.7152  0.94737  0.80303
Neg Pred Value         0.7308   0.7360  0.99424  0.99910
Prevalence             0.3378   0.6168  0.02136  0.02398
Detection Rate         0.1151   0.5593  0.01569  0.02310
Detection Prevalence   0.1726   0.7820  0.01656  0.02877
Balanced Accuracy      0.6269   0.6627  0.86690  0.97892
varimp_rf_cforest3 <- party::varimp(rf_cforest3, conditional = F) 

Variable Importance Plot

varimp_rf_cforest3_df <- data.frame(Variable = names(varimp_rf_cforest3),
                                    Importance = varimp_rf_cforest3)
ggplot(varimp_rf_cforest3_df,
       aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  coord_flip() + theme_minimal() +
  labs(title = "Variable Importance", x = "Variables", y = "Importance")

All GPS points above p20 and below p80

Filter the data to get only GPS-points above p20 and below p80 of NDVI_max and NDMI_min.

# Filter rows above the 20th percentile and below the 80th percentile for both variables
filtered_data4 <- all_GPS_valid %>%
  filter(
    (NDVI_max >= percentile_20_NDVI_max & NDVI_max <= percentile_80_NDVI_max) &
    (NDMI_min >= percentile_20_NDMI_min & NDMI_min <= percentile_80_NDMI_min)
    ) %>%
  filter(!is.na(NDVI_max) & !is.na(NDMI_max) & !is.na(NDWI_max) &
           !is.na(SAVI_max) & !is.na(EVI_max) & !is.na(NDVI_min) &
           !is.na(NDMI_min) & !is.na(NDWI_min) & !is.na(SAVI_min) &
           !is.na(EVI_min)) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(EVI_max <= 1 & EVI_min >= -1)

Split into training and test data sets.

train_indices4 <- sample(1:nrow(filtered_data4), 0.7 * nrow(filtered_data4))
train_data4 <- filtered_data4[train_indices4, ]
test_data4 <- filtered_data4[-train_indices4, ]

Number of points per category for filtered data:

filtered_data4 %>% count(EUNISa_1)
rf_cforest4 <- party::cforest(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max +
                                NDMI_min + NDWI_max + NDWI_min + EVI_max +
                                EVI_min + SAVI_max + SAVI_min + canopy_height, 
                              data = train_data4,
                              controls = cforest_control(
                                mtry = 3,
                                # mtry = sqrt(11)
                                # Default mtry = 5
                                # Bagging: mtry = NULL
                                # or = number of input variables
                                ntree = 500) # Default, try increasing
                              ) 
predictions_rf_cforest4 <- predict(rf_cforest4, newdata = test_data4,
                                   OOB = TRUE, type = "response")

Confusion matrix:

confusionMatrix(predictions_rf_cforest4, test_data4$EUNISa_1)
Confusion Matrix and Statistics

          Reference
Prediction    Q    R    S    T
         Q  387   30    0    0
         R  149 1121    0    0
         S    0    0   17    1
         T    0    0   11   30

Overall Statistics
                                         
               Accuracy : 0.8906         
                 95% CI : (0.875, 0.9049)
    No Information Rate : 0.6592         
    P-Value [Acc > NIR] : < 2.2e-16      
                                         
                  Kappa : 0.7551         
                                         
 Mcnemar's Test P-Value : NA             

Statistics by Class:

                     Class: Q Class: R Class: S Class: T
Sensitivity            0.7220   0.9739 0.607143  0.96774
Specificity            0.9752   0.7496 0.999418  0.99359
Pos Pred Value         0.9281   0.8827 0.944444  0.73171
Neg Pred Value         0.8879   0.9370 0.993634  0.99941
Prevalence             0.3070   0.6592 0.016037  0.01775
Detection Rate         0.2216   0.6420 0.009737  0.01718
Detection Prevalence   0.2388   0.7274 0.010309  0.02348
Balanced Accuracy      0.8486   0.8618 0.803280  0.98066

HERE: Compare RF 1-4

Cordillera data

AlpineGrasslands_indices <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrassland_Sentinel_Plot_Allyear_Allmetrics.csv")
Rows: 40 Columns: 69── Column specification ─────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr  (9): system:index, Codigo, H�bitat, Localidad, Precisi�n, Subregion, UTM, source, .geo
dbl (60): Altitude__, Altura_med, Altura_m�, Cobertur_1, Cobertur_2, Cobertura, DATE, EVI_max, EV...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
AlpineGrasslands_phen <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrasslands_Phenology_SOS_EOS_Peak_NDVI_Amplitude.csv")
Rows: 40 Columns: 34── Column specification ─────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr  (8): system:index, Codigo, H�bitat, Localidad, Precisi�n, Subregion, UTM, .geo
dbl (26): Altitude__, Altura_med, Altura_m�, Cobertur_1, Cobertur_2, Cobertura, DATE, EOS_DOY, Gp...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
AlpineGrasslands_CH <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrasslands_CanopyHeight_1m.csv")
Rows: 40 Columns: 27── Column specification ─────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr  (8): system:index, Codigo, Hábitat, Localidad, Precisión, Subregion, UTM, .geo
dbl (19): Altitude__, Altura_med, Altura_má, Cobertur_1, Cobertur_2, Cobertura, Date__year, Gps_e...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
VegetationTypes_indices <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/VegetationTypes/VegetationTypes_Sentinel_Plot_AllYear_Allmetrics.csv")
Rows: 32 Columns: 54── Column specification ─────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr   (8): system:index, EUNIS, ID, TYPE, Xo, Yo, source, .geo
dbl  (45): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, EVI_stdDev, EVImean, NDMI_ma...
date  (1): DATE
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
VegetationTypes_phen <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/VegetationTypes/VegetationTypes_Phenology_SOS_EOS_Peak_NDVI_Amplitude.csv")
Rows: 32 Columns: 19── Column specification ─────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr   (7): system:index, EUNIS, ID, TYPE, Xo, Yo, .geo
dbl  (11): EOS_DOY, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Peak_DOY, SOS_DOY, Se...
date  (1): DATE
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
VegetationTypes_CH <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/VegetationTypes/VegetationTypes_CanopyHeight_1m.csv")
Rows: 32 Columns: 12── Column specification ─────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr  (7): system:index, EUNIS, ID, TYPE, Xo, Yo, .geo
dbl  (4): X, Y, canopy_height, evaluated
date (1): DATE
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
AlpineGrasslands <- AlpineGrasslands_indices %>%
  select(-`system:index`, -.geo, -Localidad) %>%
  rename(Hábitat = "H�bitat") %>% 
  full_join(AlpineGrasslands_phen  %>%
              select(-`system:index`, -.geo, -Localidad) %>%
              rename(Hábitat = "H�bitat")) %>%
  full_join(AlpineGrasslands_CH  %>%
              select(-`system:index`, -.geo, -Localidad)) %>%
  select(-Date__year, - `Precisi�n`) %>%
  mutate(DATE = ymd(DATE)) %>%
  rename(ID = "Releve_num") %>%
  mutate(ID = as.character(ID)) %>%
  mutate(layer = "AlpineGrasslands")
Joining with `by = join_by(Altitude__, Altura_med, `Altura_m�`, Cobertur_1, Cobertur_2, Cobertura, Codigo, DATE, Gps_error, Hgtavg_hl, Hgtmax_hl, Hábitat, Latitude, Longitude, Num_specie, Orientaci, Pendiente, `Precisi�n`, Releve_num, Subregion, UTM, X, Y)`Joining with `by = join_by(Altitude__, Altura_med, Cobertur_1, Cobertur_2, Cobertura, Codigo, Gps_error, Hgtavg_hl, Hgtmax_hl, Hábitat, Latitude, Longitude, Num_specie, Orientaci, Pendiente, Releve_num, Subregion, UTM, X, Y)`
VegetationTypes <- VegetationTypes_indices %>%
  select(-`system:index`, -.geo) %>%
  full_join(VegetationTypes_phen  %>%
              select(-`system:index`, -.geo)) %>%
  full_join(VegetationTypes_CH  %>%
              select(-`system:index`, -.geo)) %>%
  rename(Hábitat = "TYPE") %>%
  mutate(layer = "VegetationTypes")
Joining with `by = join_by(DATE, EUNIS, ID, TYPE, X, Xo, Y, Yo, evaluated)`Joining with `by = join_by(DATE, EUNIS, ID, TYPE, X, Xo, Y, Yo, evaluated)`

Merge both datasets:

cordillera <- bind_rows(
  AlpineGrasslands %>% select(DATE, ID, starts_with("NDMI"),
                              starts_with("NDVI"), Hábitat, "EOS_DOY",
                              "Peak_DOY", "SOS_DOY", "Season_Length",
                              "canopy_height", "layer"),
  VegetationTypes %>% select(DATE, ID, starts_with("NDMI"),
                              starts_with("NDVI"), Hábitat, "EOS_DOY",
                              "Peak_DOY", "SOS_DOY", "Season_Length",
                              "canopy_height", "layer")
  ) %>%
  mutate(EUNISa_1 = case_when(
    Hábitat = str_detect(Hábitat, "Pastizal|Cervunal|grassland|meadow") ~ "R",
    Hábitat = str_detect(Hábitat, "forest") ~ "T",
    Hábitat = str_detect(Hábitat, "Scrub|scrub|Shrubland|shrubland|shrub|Heathland") ~ "S",
    Hábitat = str_detect(Hábitat, "Suelo|Scree|scree|cliff") ~ "U",
    Hábitat = is.na(Hábitat) ~ "R",
    TRUE ~ NA_character_),
    EUNISa_1_descr = case_when(
      EUNISa_1 == "R" ~ "Grasslands",
      EUNISa_1 == "T" ~ "Forests and other wooded land",
      EUNISa_1 == "S" ~ "Heathlands, scrub and tundra",
      EUNISa_1 == "U" ~ "Inland habitats with no or little soil")
    )

Max. and min. NDVI, NDMI

plot_distr_NDVI_max_cordi <- distr_plot(cordillera,
                                             "NDVI_max", "NDVI max")
plot_distr_NDVI_min_cordi <- distr_plot(cordillera,
                                             "NDVI_min", "NDVI min")
plot_distr_NDMI_max_cordi <- distr_plot(cordillera,
                                             "NDMI_max", "NDMI max")
plot_distr_NDMI_min_cordi <- distr_plot(cordillera,
                                             "NDMI_min", "NDMI min")
plot_distr_NDVI_max_cordi

plot_distr_NDVI_min_cordi

plot_distr_NDMI_max_cordi

plot_distr_NDMI_min_cordi

OLD from here

Plots NDVI and NDMI

ggplot(data = cordi %>% filter(!is.na(NDVI_max)) %>%
         # Keep only forests, grasslands, shrublands and wetlands
         filter(EUNIS_1 %in% c("T", "R", "S", "Q")),
       aes(x = EUNIS_1, y = NDVI_max, fill = EUNIS_1)) +
  geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8) +
  geom_point(aes(y = NDVI_max, color = EUNIS_1),
             position = position_jitter(width = 0.15), size = 3,
             alpha = 0.5) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
  stat_summary(fun.y=mean, geom="point", shape = 20, size = 3) +
  stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                 label = length(x)),
               geom = "text", aes(label = ..label..), vjust = 0.5) +
  guides(fill = FALSE, color = FALSE) + theme_bw() + coord_flip()
Error: objeto 'cordi' no encontrado

Maps

Points differential GPS

Maps

Number of differential GPS points by Country:

Points ReSurvey

Maps

Number of ReSurvey points by Country:

Join

Session info

---
title: "Script to validate points in ReSurvey database using RS data"
subtitle: "Adding ATL_BENELUX and CON_NORDIC S2 data, some Landsat data, adding canopy height data"
author: "Alicia Valdés"
date: "`r format(Sys.time(), '%d %B %Y')`"
output:
  pdf_document: default
  html_notebook: default
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(warning = FALSE)
```

This R script is used to validate the points in the ReSurvey database using RS indicators (NDVI, NDMI, canopy height).

# Load libraries

```{r}
library(tidyverse)
library(here)
library(gridExtra)
library(readxl)
library(scales)
library(sf)
library(rnaturalearth)
library(dtplyr)
library(lme4)
library(lmerTest)
library(car)
library(ggeffects)
library(party)
library(partykit)
library(strucchange)
library(ggparty)
library(caret)
library(moreparty)
library(randomForest)
```

# Define printall function

```{r}
printall <- function(tibble) {
  print(tibble, width = Inf)
  }
```

# Load geom_flat_violin plot

```{r}
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")
```

# Read ReSurvey data with RS indicators

```{r}
db_resurv_RS<-read_tsv(
  here("data", "clean", "db_resurv_RS_20250505.csv"),
  col_types = cols(
    # Dynamically specify EUNIS columns as character
    .default = col_guess(),  # Default guessing for other columns
    EUNISa = col_character(),
    EUNISb = col_character(),
    EUNISc = col_character(),
    EUNISd = col_character(),
    EUNISa_1 = col_character(),
    EUNISa_2 = col_character(),
    EUNISa_3 = col_character(),
    EUNISa_4 = col_character(),
    EUNISb_1 = col_character(),
    EUNISb_2 = col_character(),
    EUNISb_3 = col_character(),
    EUNISb_4 = col_character(),
    EUNISc_1 = col_character(),
    EUNISc_2 = col_character(),
    EUNISc_3 = col_character(),
    EUNISc_4 = col_character(),
    EUNISd_1 = col_character(),
    EUNISd_2 = col_character(),
    EUNISd_3 = col_character(),
    EUNISd_4 = col_character(),
    EUNISa_1_descr = col_character(),
    EUNISb_1_descr = col_character(),
    EUNISc_1_descr = col_character(),
    EUNISd_1_descr = col_character(),
    EUNIS_assignation = col_character(),
    EUNISa_2_descr = col_character(),
    EUNISa_3_descr = col_character(),
    EUNISa_4_descr = col_character(),
    EUNISb_2_descr = col_character(),
    EUNISb_3_descr = col_character(),
    EUNISb_4_descr = col_character(),
    EUNISc_2_descr = col_character(),
    EUNISc_3_descr = col_character(),
    EUNISc_4_descr = col_character(),
    EUNISd_2_descr = col_character(),
    EUNISd_3_descr = col_character(),
    EUNISd_4_descr = col_character()
    )
  )
```

No parsing issues!

# Some data managenemt

## Several EUNIS level 1 assigned

Number of rows where there is more than one EUNIS 1 assigned, and they are different among them. See what to do with these later! So far I take EUNISa_1.

```{r}
nrow(db_resurv_RS %>% 
       # Rows with more than one EUNIS 1 assigned
       filter(!is.na(EUNISb_1)) %>% 
       filter(EUNISa_1!=EUNISb_1 | EUNISb_1 != EUNISc_1 | EUNISa_1 != EUNISc_1))
```

See "confusions":

```{r}
db_resurv_RS %>% 
  # Rows with more than one EUNIS 1 assigned
  filter(!is.na(EUNISb_1)) %>% 
  filter(EUNISa_1!=EUNISb_1 | EUNISb_1 != EUNISc_1 | EUNISa_1 != EUNISc_1) %>%
  distinct(EUNISa_1, EUNISb_1, EUNISc_1, EUNISd_1)
```

Define "confusion" columns:

```{r}
db_resurv_RS <- db_resurv_RS %>%
  mutate(EUNIS1_conf_type = case_when(
    EUNISa_1 == "R" & EUNISb_1 == "S" ~ "R/S",
    EUNISa_1 == "S" & EUNISb_1 == "T" ~ "S/T",
    EUNISa_1 == "R" & EUNISb_1 == "R" & EUNISc_1 == "S" ~ "R/S",
    EUNISa_1 == "R" & EUNISb_1 == "R" & EUNISc_1 == "S" & EUNISd_1 == "S" ~ "R/S",
    EUNISa_1 == "P" & EUNISb_1 == "Q" ~ "P/Q",
    TRUE ~ NA_character_),
    EUNIS1_conf = !is.na(EUNIS1_conf_type))
```


## Tibble with selected columns

```{r}
db_resurv_RS_short <- db_resurv_RS %>%
  select(PlotObservationID, Country, RS_CODE, `ReSurvey site`, `ReSurvey plot`,
         `Manipulate (y/n)`, `Type of manipulation`, Lon_updated, Lat_updated,
         `Location method`, `Location uncertainty (m)`, EUNISa_1,
         EUNISa_1_descr, EUNISa_2, EUNISa_2_descr, EUNISa_3, EUNISa_3_descr,
         EUNISa_4, EUNISa_4_descr, EUNIS1_conf, EUNIS1_conf_type,
         date, year, biogeo, unit, year_RS, Lon_RS, Lat_RS, 
         starts_with("NDVI"), starts_with("NDMI"), starts_with("NDWI"),
         starts_with("EVI"), starts_with("SAVI"), canopy_height,
         SOS_DOY, SOS_date, NDVI_at_SOS, Peak_DOY, Peak_date, NDVI_at_Peak,
         EOS_DOY, EOS_date, NDVI_at_EOS, Season_Length,
         S2_data, Landsat_data, CH_data, S2_phen_data)
```

## TO-DO: Missing data checks

Do when all RS data is ready!

## Flag when year is different between RS data and ReSurvey db

```{r}
db_resurv_RS_short <- db_resurv_RS_short %>%
  mutate(year_diff = year != year_RS)
```

```{r}
db_resurv_RS_short %>% count(year_diff)
```

2 with different year. RS indices would need to be calculated again for those.

## Flag when coordinates are different between RS data and ReSurvey db

```{r}
db_resurv_RS_short <- db_resurv_RS_short %>%
  mutate(Lon_diff = case_when(Lon_updated == Lon_RS ~ "NO",
                              # Sometimes they are only slighly different
                              abs(Lon_updated - Lon_RS) < 0.01 ~ "SMALL",
                              is.na(Lon_updated) | is.na(Lon_RS) ~ NA,
                              TRUE ~ "LARGE"),
         Lat_diff = case_when(Lat_updated == Lat_RS ~ "NO",
                              # Sometimes they are only slighly different
                              abs(Lat_updated - Lat_RS) < 0.01 ~ "SMALL",
                              is.na(Lat_updated) | is.na(Lat_RS) ~ NA,
                              TRUE ~ "LARGE"))
```

```{r}
db_resurv_RS_short %>% count(Lon_diff)
db_resurv_RS_short %>% count(Lat_diff)
```

Very few with large differences (2 for longitude, 6 for latitude). RS indices would need to be calculated again for those.

If year_diff is TRUE or Lon_diff is LARGE or Lat_diff is LARGE --> RS indices need to be recalculated. So far, remove those rows from db_resurv_RS_short.

```{r}
db_resurv_RS_short <- db_resurv_RS_short %>%
  filter(year_diff == FALSE | is.na(year_diff)) %>%
  filter(Lon_diff == "NO" |Lon_diff == "SMALL" | is.na(Lon_diff)) %>%
  filter(Lat_diff == "NO" |Lat_diff == "SMALL" | is.na(Lat_diff))
```

## TO-DO: Recalculate RS indices for those above

## Handle plots that have more than one obs per year

Add column PLOT to data to identify unique plots:

```{r}
db_resurv_RS_short_PLOT <- db_resurv_RS_short %>%
  # Original names give problems, create new vars
  mutate(RS_site = `ReSurvey site`, RS_plot = `ReSurvey plot`) %>%
  # Convert to data.table for faster processing
  lazy_dt() %>%
  # Group by the 3 vars that uniquely identify each pl ot
  group_by(RS_CODE, RS_site, RS_plot) %>%
  # Create a new variable PLOT for each group
  mutate(PLOT = .GRP) %>%
  # Convert back to tibble
  as_tibble() %>%
  # Remove unneeded vars
  select(-RS_site, -RS_plot)
```

There should be only one observation of each plot per year.

Plots where there is at least a year with more than one observation, and where those observations have a different EUNIS assigned:

```{r}
plots_to_remove <- db_resurv_RS_short_PLOT %>%
  group_by(PLOT, year) %>%
  summarize(EUNISa_1_n = n_distinct(EUNISa_1, na.rm = TRUE)) %>%
  ungroup() %>% 
  filter(EUNISa_1_n > 1) %>%
  distinct(PLOT)
```

Remove plots_to_remove from the database:

```{r}
db_resurv_RS_short_PLOT <- db_resurv_RS_short_PLOT %>%
  anti_join(plots_to_remove, by = "PLOT")
```

Plots and years where there is more than one observation:

```{r}
plots_to_merge <- db_resurv_RS_short_PLOT %>%
  group_by(PLOT, year) %>%
  # Plots that have more than one observation per year
  filter(n() > 1) %>%
  ungroup() %>%
  distinct(PLOT)
```

Summarize plots_to_merge:

```{r}
plots_to_merge_summ <- db_resurv_RS_short_PLOT %>%
  group_by(PLOT, year) %>%
  # Plots that have more than one observation per year
  filter(n() > 1) %>%
  mutate(obs_num = row_number()) %>%
  pivot_wider(
    names_from = obs_num,
    values_from = c(date, PlotObservationID),
    names_prefix = "obs_"
  ) %>%
  arrange(PLOT) %>%
  summarize(
    across(c(Country, RS_CODE, `ReSurvey site`, `ReSurvey plot`,
             `Manipulate (y/n)`, `Type of manipulation`, Lon_updated,
             Lat_updated, `Location method`, `Location uncertainty (m)`,
             EUNISa_1, EUNISa_1_descr, EUNISa_2, EUNISa_2_descr, EUNISa_3,
             EUNISa_3_descr, EUNISa_4, EUNISa_4_descr, EUNIS1_conf,
             EUNIS1_conf_type, biogeo, unit, year_RS, Lon_RS, Lat_RS, NDVI_max,
             NDVI_median, NDVI_min, NDVI_mode, NDVI_p10, NDVI_p90, NDMI_max,
             NDMI_median, NDMI_min, NDMI_mode, NDMI_p10, NDMI_p90, NDWI_max,
             NDWI_median, NDWI_min, NDWI_mode, NDWI_p10, NDWI_p90, EVI_max,
             EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, SAVI_max,
             SAVI_median, SAVI_min, SAVI_mode, SAVI_p10, SAVI_p90,
             canopy_height, SOS_DOY, SOS_date, Peak_DOY, Peak_date, EOS_DOY,
             EOS_date, S2_data, Landsat_data, CH_data, S2_phen_data, year_diff,
             Lon_diff, Lat_diff), first),
    across(starts_with("date_obs_"), min),
    across(starts_with("PlotObservationID_obs_"), min)
    ) %>%
  ungroup()
```

Remove plots_to_merge from the database:

```{r}
db_resurv_RS_short_PLOT <- db_resurv_RS_short_PLOT %>%
  anti_join(plots_to_merge, by = "PLOT")
```

And add plots_to_merge_summ, where each plot and year only has one row:

```{r}
db_resurv_RS_short_PLOT <- bind_rows(db_resurv_RS_short_PLOT,
                                     plots_to_merge_summ)
```

Check that there is only one row per plot and per year:

```{r}
db_resurv_RS_short_PLOT %>%
  group_by(PLOT, year) %>%
  # Plots that have more than one observation per year
  filter(n() > 1) 
```

So, to sum up what I have done:

- Plots where there is at least a year with more than one observation, and where those observations have a different EUNIS assigned: Plots REMOVED from the data
- Plots where there is more than one observation, but observations have the same EUNIS assigned: kept in the data. Merged so that there is only one row per year. Info about the different dates (when different) is kept in columns date_obs_1 - date_obs_40, and info about the different PlotObservationID is kept in the columns PlotObservationID_obs_1 - PlotObservationID_obs_40.

### Save to clean data

Save clean file for analyses (to be updated continuously due to updates in ReSurvey database and updates on RS data).

```{r}
write_tsv(db_resurv_RS_short_PLOT,
          here("data", "clean","db_resurv_RS_short_PLOT_20250505.csv"))
```

# Distributions all bioregions

```{r}
# Define a function to create histograms
plot_histogram <- function(data, x_var, x_label) {
  ggplot(data %>%
           filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
         aes(x = !!sym(x_var))) +
    geom_histogram(color = "black", fill = "white") +
    labs(x = x_label, y = "Frequency") +
    theme_bw()
}
```

```{r}
# Define a function to create plots with violin + boxplot + points
distr_plot <- function(data, y_vars, y_labels) {
  for (i in seq_along(y_vars)) {
    y_var <- y_vars[[i]]
    y_label <- y_labels[[i]]
    
    p <- ggplot(data = data %>%
                  filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
                aes(x = EUNISa_1_descr, y = !!sym(y_var), fill = EUNISa_1_descr)) +
      geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8) +
      geom_point(aes(y = !!sym(y_var), color = EUNISa_1_descr),
                 position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
      geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
      stat_summary(fun.y = mean, geom = "point", shape = 20, size = 1) +
      stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                     label = length(x)),
                   geom = "text", aes(label = ..label..), vjust = 0.5) +
      labs(y = y_label, x = "EUNIS level 1") +
      scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
      guides(fill = FALSE, color = FALSE) +
      theme_bw() + coord_flip()
    
    print(p)
  }
}
```

## HERE: Metrics calculated only for July, not for all year!

## NDVI, NDMI, NDWI, SAVI and EVI

Histograms to check that values are ok:

```{r}
hist_NDVI <- plot_histogram(db_resurv_RS_short_PLOT, "NDVI_max", "NDVI max")
hist_NDMI <- plot_histogram(db_resurv_RS_short_PLOT, "NDMI_max", "NDMI max")
hist_NDWI <- plot_histogram(db_resurv_RS_short_PLOT, "NDWI_max", "NDWI max")
hist_SAVI <- plot_histogram(db_resurv_RS_short_PLOT, "SAVI_max", "SAVI max")
hist_EVI <- plot_histogram(db_resurv_RS_short_PLOT, "EVI_max", "EVI max")
hist_NDVI
hist_NDMI
hist_NDWI
hist_SAVI
hist_EVI # Some values wrong!
hist_EVI_1 <- plot_histogram(db_resurv_RS_short_PLOT %>% filter(EVI_max <= 1),
                             "EVI_max", "EVI max")
hist_EVI_1
```

```{r}
nrow(db_resurv_RS_short_PLOT %>%
       filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
       filter(EVI_max > 1))
db_resurv_RS_short_PLOT %>%
       filter(EUNISa_1 %in% c("T", "R", "S", "Q"))%>%
  filter(EVI_max > 1) %>%
  count(biogeo, unit)
```

So far, remove observations with EVI_max > 1 from stuff using EVI values. 

Distribution plots:

```{r}
distr_plot(db_resurv_RS_short_PLOT,
           c("NDVI_max", "NDVI_p90", "NDVI_min", "NDVI_p10"), 
           c("NDVI max", "NDVI p90", "NDVI min", "NDVI p10"))
distr_plot(db_resurv_RS_short_PLOT,
           c("NDMI_max", "NDMI_p90", "NDMI_min", "NDMI_p10"), 
           c("NDMI max", "NDMI p90", "NDMI min", "NDMI p10"))
distr_plot(db_resurv_RS_short_PLOT,
           c("NDWI_max", "NDWI_p90", "NDWI_min", "NDWI_p10"), 
           c("NDWI max", "NDWI p90", "NDWI min", "NDWI p10"))
distr_plot(db_resurv_RS_short_PLOT,
           c("SAVI_max", "SAVI_p90", "SAVI_min", "SAVI_p10"), 
           c("SAVI max", "SAVI p90", "SAVI min", "SAVI p10"))
distr_plot(db_resurv_RS_short_PLOT %>% filter(EVI_max <= 1),
           c("EVI_max", "EVI_p90", "EVI_min", "EVI_p10"), 
           c("EVI max", "EVI p90", "EVI min", "EVI p10"))
```

## CH

```{r}
plot_distr_CH <- distr_plot(db_resurv_RS_short_PLOT, "canopy_height",
                            "Canopy height (m)")
plot_distr_CH
```
 
### Show habitats with CH categories

```{r}
ggplot(db_resurv_RS_short_PLOT %>%
         # Keep only forests, grasslands, shrublands and wetlands
         filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
         mutate(CH_cat =
                  factor(
                    case_when(canopy_height == 0 ~ "0 m",
                              canopy_height > 0 & canopy_height <= 1 ~ "0-1 m",
                              canopy_height > 1 & canopy_height <=2 ~ "1-2 m",
                              canopy_height > 2 & canopy_height <=5 ~ "2-5 m",
                              canopy_height > 5 & canopy_height <=8 ~ "5-8 m",
                              canopy_height > 8 ~ "> 8 m",
                              is.na(canopy_height) ~ NA_character_),
                    levels = c(
                      "0 m", "0-1 m", "1-2 m", "2-5 m", "5-8 m", "> 8 m"))),
       aes(x = EUNISa_1_descr, fill = CH_cat)) +
  geom_bar() + theme_bw() + coord_flip() +
  scale_y_continuous(labels = label_number()) +
  scale_fill_viridis_d(direction = -1) +
  labs(x = "EUNIS level 1", fill = "Canopy height") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  theme(legend.position = c(0.8, 0.75),
        legend.direction = "vertical")
```

### Stats per habitat type

```{r}
db_resurv_RS_short_PLOT %>%
  # Keep only forests, grasslands, shrublands and wetlands
  filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  group_by(EUNISa_1_descr) %>%
  summarise(across(canopy_height, list(
    mean = mean,
    median = median,
    sd = sd,
    min = min,
    max = max
    ), na.rm = TRUE))
```

## Phenology

### Calculate metrics

```{r}
db_resurv_RS_short_PLOT <- db_resurv_RS_short_PLOT %>%
  mutate(
    # Difference NDVI between Peak and SOS
    diff_Peak_SOS = NDVI_at_Peak - NDVI_at_SOS,
    # Difference NDVI between Peak and EOS
    diff_Peak_EOS = NDVI_at_Peak - NDVI_at_EOS)
```

### Histograms phenology measures

```{r}
ggplot(data = db_resurv_RS_short_PLOT %>%
         # Keep only forests, grasslands, shrublands and wetlands
         filter(EUNISa_1 %in% c("T", "R", "S", "Q") & S2_phen_data == T) %>%
         pivot_longer(cols = c(SOS_DOY, Peak_DOY, EOS_DOY), names_to = "name",
                      values_to = "value"),
       aes(x = value)) +
  geom_histogram(fill = "white", color = "black") +
  facet_grid(biogeo ~ name, scales = "free_y") +
  theme_bw()
ggplot(data = db_resurv_RS_short_PLOT %>%
         # Keep only forests, grasslands, shrublands and wetlands
         filter(EUNISa_1 %in% c("T", "R", "S", "Q") & S2_phen_data == T) %>%
         pivot_longer(cols = c(NDVI_at_SOS, NDVI_at_Peak, NDVI_at_EOS),
                      names_to = "name", values_to = "value"),
       aes(x = value)) +
  geom_histogram(fill = "white", color = "black") +
  facet_grid(biogeo ~ name, scales = "free_y") +
  theme_bw()
ggplot(data = db_resurv_RS_short_PLOT %>%
         # Keep only forests, grasslands, shrublands and wetlands
         filter(EUNISa_1 %in% c("T", "R", "S", "Q") & S2_phen_data == T) %>%
         pivot_longer(cols = c(diff_Peak_SOS, diff_Peak_EOS),
                      names_to = "name", values_to = "value"),
       aes(x = value)) +
  geom_histogram(fill = "white", color = "black") +
  facet_grid(biogeo ~ name, scales = "free_y") +
  theme_bw()
ggplot(data = db_resurv_RS_short_PLOT %>%
         # Keep only forests, grasslands, shrublands and wetlands
         filter(EUNISa_1 %in% c("T", "R", "S", "Q") & S2_phen_data == T) %>%
         pivot_longer(cols = c(Season_Length),
                      names_to = "name", values_to = "value"),
       aes(x = value)) +
  geom_histogram(fill = "white", color = "black") +
  facet_grid(biogeo ~ name, scales = "free_y") +
  theme_bw()
```

### Distributions

```{r}
plot_distr_SOS_DOY <- distr_plot(db_resurv_RS_short_PLOT, "SOS_DOY",
                                 "SOS DOY")
plot_distr_Peak_DOY <- distr_plot(db_resurv_RS_short_PLOT, "Peak_DOY",
                                  "Peak DOY")
plot_distr_EOS_DOY <- distr_plot(db_resurv_RS_short_PLOT, "EOS_DOY",
                                 "EOS DOY")
plot_distr_NDVI_at_SOS <- distr_plot(db_resurv_RS_short_PLOT, "NDVI_at_SOS",
                                     "NDVI at SOS")
plot_distr_NDVI_at_Peak <- distr_plot(db_resurv_RS_short_PLOT, "NDVI_at_Peak",
                                      "NDVI at Peak")
plot_distr_NDVI_at_EOS <- distr_plot(db_resurv_RS_short_PLOT, "NDVI_at_EOS",
                                     "NDVI at EOS")
plot_distr_diff_Peak_SOS <- distr_plot(db_resurv_RS_short_PLOT, "diff_Peak_SOS",
                                       "Difference Peak-SOS")
plot_distr_diff_Peak_EOS <- distr_plot(db_resurv_RS_short_PLOT, "diff_Peak_EOS",
                                       "Difference Peak-EOS")
plot_distr_Season_Length <- distr_plot(db_resurv_RS_short_PLOT, "Season_Length",
                                       "Season Length")
plot_distr_SOS_DOY
plot_distr_Peak_DOY
plot_distr_EOS_DOY
plot_distr_NDVI_at_SOS
plot_distr_NDVI_at_Peak
plot_distr_NDVI_at_EOS
plot_distr_diff_Peak_SOS
plot_distr_diff_Peak_EOS
plot_distr_Season_Length
```

# Distributions per bioregion

```{r}
# Define a function to create plots with violin + boxplot + points
distr_plot_biogeo <- function(data, y_var, y_label) {
  ggplot(data = data %>%
           filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
         aes(x = EUNISa_1_descr, y = !!sym(y_var), fill = EUNISa_1_descr)) +
    geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8) +
    geom_point(aes(y = !!sym(y_var), color = EUNISa_1_descr),
               position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
    geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
    stat_summary(fun.y = mean, geom = "point", shape = 20, size = 1) +
    stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                   label = length(x)),
                 geom = "text", aes(label = ..label..), vjust = 0.5) +
    labs(y = y_label, x = "EUNISa_1_descr") +
    scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
    guides(fill = FALSE, color = FALSE) +
    theme_bw() + coord_flip() + facet_wrap(~ biogeo)
}
```

## Max. NDVI, NDMI, NDWI, SAVI and EVI

Distribution plots:

```{r}
plot_distr_NDVI_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                            "NDVI_max", "NDVI max")
plot_distr_NDMI_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                            "NDMI_max", "NDMI max")
plot_distr_NDWI_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                            "NDWI_max", "NDWI max")
plot_distr_SAVI_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                            "SAVI_max", "SAVI max")
plot_distr_EVI_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)) %>%
                                             filter(EVI_max <= 1),
                                           "EVI_max", "EVI max")
plot_distr_NDVI_biogeo
plot_distr_NDMI_biogeo
plot_distr_NDWI_biogeo
plot_distr_SAVI_biogeo
plot_distr_EVI_biogeo
```

BOR missing because there is no EUNIS info for those that have RS info.

## CH

```{r}
plot_distr_CH_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT,
                                          "canopy_height", "Canopy height (m)")
plot_distr_CH_biogeo
```

In this plot, those with biogeo = NA are those that do not have S2 or Landsat data (and thus biogeo has not been assigned), but have CH data. We should later assign a biogeo based on location. 

## Phenology

```{r}
plot_distr_SOS_DOY_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                               "SOS_DOY", "SOS DOY")
plot_distr_Peak_DOY_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                                "Peak_DOY", "Peak DOY")
plot_distr_EOS_DOY_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                               "EOS_DOY", "EOS DOY")
plot_distr_NDVI_at_SOS_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                               "NDVI_at_SOS", "NDVI at SOS")
plot_distr_NDVI_at_Peak_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                                "NDVI_at_Peak", "NDVI at Peak")
plot_distr_NDVI_at_EOS_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                               "NDVI_at_EOS", "NDVI at EOS")
plot_distr_diff_Peak_SOS_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                               "diff_Peak_SOS", "Difference Peak-SOS")
plot_distr_diff_Peak_EOS_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                               "diff_Peak_EOS", "Difference Peak-EOS")
plot_distr_Season_Length_biogeo <- distr_plot_biogeo(db_resurv_RS_short_PLOT %>%
                                              filter(!is.na(biogeo)),
                                               "Season_Length", "Season Length")

plot_distr_SOS_DOY_biogeo
plot_distr_Peak_DOY_biogeo
plot_distr_EOS_DOY_biogeo
plot_distr_NDVI_at_SOS_biogeo
plot_distr_NDVI_at_Peak_biogeo
plot_distr_NDVI_at_EOS_biogeo
plot_distr_diff_Peak_SOS_biogeo
plot_distr_diff_Peak_EOS_biogeo
plot_distr_Season_Length_biogeo
```

BOR missing because there is no EUNIS info for those that have RS info.

# HERE: Verify SOS-Peak_EOS ODY

ERRORS! Double-check and send to Bea:

```{r}
db_resurv_RS_short_PLOT %>% filter(SOS_DOY > Peak_DOY)
db_resurv_RS_short_PLOT %>% filter(Peak_DOY > EOS_DOY)
db_resurv_RS_short_PLOT %>% filter(SOS_DOY > EOS_DOY)
```
```{r}
db_resurv_RS_short_PLOT %>% filter(NDVI_at_Peak < NDVI_at_SOS)
db_resurv_RS_short_PLOT %>% filter(NDVI_at_Peak < NDVI_at_EOS)
```

# Comparison differential GPS vs. other points

```{r}
# Define function
distr_plot_GPS <- function(data, y_var, y_label) {
  ggplot(data = data %>%
           filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
           mutate(GPS = 
                    ifelse(
                      is.na(`Location method`) |
                        `Location method` != "Location with differential GPS",
                      "Other",
                      "Differential GPS")),
         aes(x = EUNISa_1_descr, y = !!sym(y_var), fill = EUNISa_1_descr)) +
    geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8) +
    geom_point(aes(y = !!sym(y_var), color = EUNISa_1_descr),
               position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
    geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
    stat_summary(fun.y = mean, geom = "point", shape = 20, size = 1) +
    stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                   label = length(x)),
                 geom = "text", aes(label = ..label..), vjust = 0.5) +
    labs(y = y_label, x = "EUNIS level 1") +
    scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
    guides(fill = FALSE, color = FALSE) +
    theme_bw() + coord_flip() + facet_wrap(~ GPS)
}
```

```{r}
plot_distr_NDVI_GPS <- distr_plot_GPS(db_resurv_RS_short_PLOT,
                                      "NDVI_max", "NDVI max")
plot_distr_NDMI_GPS <- distr_plot_GPS(db_resurv_RS_short_PLOT,
                                      "NDMI_max", "NDMI max")
plot_distr_NDWI_GPS <- distr_plot_GPS(db_resurv_RS_short_PLOT,
                                      "NDWI_max", "NDWI max")
plot_distr_SAVI_GPS <- distr_plot_GPS(db_resurv_RS_short_PLOT,
                                      "SAVI_max", "SAVI max")
plot_distr_EVI_GPS <- distr_plot_GPS(db_resurv_RS_short_PLOT %>%
                                       filter(EVI_max <= 1),
                                      "EVI_max", "EVI max")
plot_distr_NDVI_GPS
plot_distr_NDMI_GPS
plot_distr_NDWI_GPS
plot_distr_SAVI_GPS
plot_distr_EVI_GPS
```

# Including PLOT (USE LATER?)

Summarize variables by plot:

```{r}
db_resurv_RS_short_PLOT_summ <- db_resurv_RS_short_PLOT %>%
  filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  filter(S2_data == T | Landsat_data == T) %>%
  group_by(PLOT) %>%
  summarize(EUNIS1 = if_else(n_distinct(EUNISa_1) > 1, "Change",
                             unique(EUNISa_1)[1]),
            count = n(),
            across(starts_with("NDVI"), list(mean = mean, sd = sd),
                   .names = "{col}_{fn}"))

```

Maybe use later because now many plots have only one observation, probably because some Landsat data is missing?

# First validation

For T, R, S, Q habitats.

Define a set of rules for a first validation of ALL ReSurvey data. We can call these "Expert-based" rules.

Number of observations in ReSurvey from the habitats of interest:

```{r}
nrow(db_resurv_RS_short_PLOT %>%
       filter(EUNISa_1 %in% c("T", "R", "S", "Q")))
```

Number of observations in ReSurvey from the habitats of interest and with all RS data:

```{r}
nrow(db_resurv_RS_short_PLOT %>%
       filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
       filter(CH_data == T) %>%
       filter(S2_data == T | Landsat_data ==T) %>%
       filter(S2_phen_data == T))
```

```{r}
db_resurv_RS_short_PLOT_terrestrial <- db_resurv_RS_short_PLOT %>%
  filter(EUNISa_1 %in% c("T", "R", "S", "Q"))
```

## Define rules

Create column for first validation based on different indicators, where "wrong" is noted when the validation rule is not met. Include EUNIS1 confusions.

```{r}
db_resurv_RS_short_PLOT_terrestrial %>% count(EUNISa_1, EUNIS1_conf_type)
```

Define rules:

```{r}
db_resurv_RS_short_PLOT_terrestrial <-
  db_resurv_RS_short_PLOT_terrestrial %>%
  mutate(
    valid_1_NDWI = case_when(
      # Points that are basically water
      NDWI_max > 0.3 ~ "wrong",
      TRUE ~ NA_character_),
    valid_1_CH = case_when(
      # T points with low CH
      EUNISa_1 == "T" & canopy_height < 8 ~ "wrong",
      # S points with low CH
      EUNISa_1 =="S" & canopy_height < 5 ~ "wrong",
      # R & Q points with high CH
      EUNISa_1 %in% c("R", "Q") & canopy_height > 2 ~ "wrong",
      TRUE ~ NA_character_),
    valid_1_NDVI = case_when(
      # T points with low NDVI_max
      EUNISa_1 == "T" & NDVI_max < 0.6 ~ "wrong",
      # S-R-Q points with low NDVI_max
      EUNISa_1 %in% c("R", "S", "Q") & NDVI_max < 0.2 ~ "wrong",
      TRUE ~ NA_character_),
    # Count how many validation rules are not met
    valid_1_count = rowSums(across(c(valid_1_NDWI, valid_1_CH, valid_1_NDVI), 
                             ~ . == "wrong"), na.rm = TRUE),
    # Points where at least 1 rule not met
    valid_1 = if_else(valid_1_count > 0, "At least 1 rule broken",
                      "No rules broken so far")
    )
```

## Plots first validation

```{r}
ggplot(db_resurv_RS_short_PLOT_terrestrial%>%
         mutate(rules_broken = case_when(
           valid_1_count == 1 & valid_1_NDWI == "wrong" ~ "NDWI",
           valid_1_count == 1 & valid_1_NDVI == "wrong" ~ "NDVI",
           valid_1_count == 1 & valid_1_CH == "wrong" ~ "CH",
           valid_1_count == 2 &
             valid_1_NDWI == "wrong" & valid_1_NDVI == "wrong"~ "NDWI + NDVI",
           valid_1_count == 2 &
             valid_1_NDWI == "wrong" & valid_1_CH == "wrong"~ "NDWI + CH",
           valid_1_count == 2 &
             valid_1_NDVI == "wrong" & valid_1_CH == "wrong"~ "NDVI + CH",
           valid_1_count == 3 ~ "NDWI + NDVI + CH",
           TRUE ~ NA_character_
         )), 
       aes(x = valid_1_count, fill = rules_broken)) +
  geom_bar() + labs(x = "Number of broken rules")
```

```{r}
db_resurv_RS_short_PLOT_terrestrial %>%
         mutate(rules_broken = case_when(
           valid_1_count == 1 & valid_1_NDWI == "wrong" ~ "NDWI",
           valid_1_count == 1 & valid_1_NDVI == "wrong" ~ "NDVI",
           valid_1_count == 1 & valid_1_CH == "wrong" ~ "CH",
           valid_1_count == 2 &
             valid_1_NDWI == "wrong" & valid_1_NDVI == "wrong"~ "NDWI + NDVI",
           valid_1_count == 2 &
             valid_1_NDWI == "wrong" & valid_1_CH == "wrong"~ "NDWI + CH",
           valid_1_count == 2 &
             valid_1_NDVI == "wrong" & valid_1_CH == "wrong"~ "NDVI + CH",
           valid_1_count == 3 ~ "NDWI + NDVI + CH",
           TRUE ~ NA_character_
         )) %>%
  count(rules_broken, EUNIS1_conf_type)
```

Proportion of observations not validated (so far):

```{r}
nrow(db_resurv_RS_short_PLOT_terrestrial %>% filter(valid_1_count > 0))/
  nrow(db_resurv_RS_short_PLOT_terrestrial)
```

But be aware that there are still many missing RS data, e.g. Landsat data for bioregion CON (many points).

```{r}
ggplot(db_resurv_RS_short_PLOT_terrestrial %>%
         mutate(diff_GPS = if_else(
           `Location method` != "Location with differential GPS" |
             is.na(`Location method`), "no", "yes")), 
       aes(x = diff_GPS, fill = valid_1)) +
  geom_bar() + labs(x = "Differential GPS")
ggplot(db_resurv_RS_short_PLOT_terrestrial %>%
         mutate(GPS = case_when(
           `Location method` == "Location with differential GPS" ~ "yes",
           `Location method` == "Location with GPS" ~ "yes",
           is.na(`Location method`) ~ "no",
           TRUE ~ "no"
         )), 
       aes(x = GPS, fill = valid_1)) +
  geom_bar() + labs(x = "GPS")
```

Points with any rule broken and confusion between EUNIS:

```{r}
nrow(db_resurv_RS_short_PLOT_terrestrial %>%
       filter(EUNIS1_conf == T & valid_1_count > 0))
```

Convert to shp to look at these in GIS:

```{r}
# st_write(db_resurv_RS_short_PLOT_terrestrial %>%
#            filter(EUNIS1_conf == T & valid_1_count > 0) %>%
#            st_as_sf(coords = c("Lon_updated", "Lat_updated"), crs = 4326),
#          "C:/GIS/MOTIVATE/shapefiles/resurv_not_val_EUNIS_conf.shp")
```

Checked and yes

How many points with differential GPS that have at least 1 rule broken?

```{r}
nrow(db_resurv_RS_short_PLOT_terrestrial %>%
  filter(`Location method` == "Location with differential GPS" &
           valid_1 == "At least 1 rule broken"))
```

Convert to shp to look at these in GIS:

```{r}
# st_write(db_resurv_RS_short_PLOT_terrestrial %>%
#            filter(`Location method` == "Location with differential GPS" &
#                     valid_1 == "At least 1 rule broken") %>%
#            st_as_sf(coords = c("Lon_updated", "Lat_updated"), crs = 4326),
#          "C:/GIS/MOTIVATE/shapefiles/resurv_not_val_diff_GPS.shp")
```

# Distributions from diff GPS points without rules broken so far

Create tibble with differential GPS points without rules broken so far:

```{r}
diff_GPS_valid <- db_resurv_RS_short_PLOT_terrestrial %>%
  filter(`Location method` == "Location with differential GPS" &
           valid_1 == "No rules broken so far") 
```

## Max. and min. NDVI, NDMI, NDWI, SAVI and EVI

```{r}
plot_distr_NDVI_max_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "NDVI_max", "NDVI max")
plot_distr_NDMI_max_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "NDMI_max", "NDMI max")
plot_distr_NDWI_max_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "NDWI_max", "NDWI max")
plot_distr_SAVI_max_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "SAVI_max", "SAVI max")
plot_distr_EVI_max_diff_GPS_valid <- distr_plot(diff_GPS_valid %>%
                                              filter(EVI_max <= 1),
                                            "EVI_max", "EVI max")
plot_distr_NDVI_max_diff_GPS_valid
plot_distr_NDMI_max_diff_GPS_valid
plot_distr_NDWI_max_diff_GPS_valid
plot_distr_SAVI_max_diff_GPS_valid
plot_distr_EVI_max_diff_GPS_valid
```

```{r}
plot_distr_NDVI_min_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "NDVI_min", "NDVI min")
plot_distr_NDMI_min_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "NDMI_min", "NDMI min")
plot_distr_NDWI_min_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "NDWI_min", "NDWI min")
plot_distr_SAVI_min_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                             "SAVI_min", "SAVI min")
plot_distr_EVI_min_diff_GPS_valid <- distr_plot(diff_GPS_valid %>%
                                                  # Some values wrong!
                                                  # EVI should not be lower than -1
                                                  filter(EVI_min >= -1),
                                                "EVI_min", "EVI min")
plot_distr_NDVI_min_diff_GPS_valid
plot_distr_NDMI_min_diff_GPS_valid
plot_distr_NDWI_min_diff_GPS_valid
plot_distr_SAVI_min_diff_GPS_valid
plot_distr_EVI_min_diff_GPS_valid
```

## CH

```{r}
plot_distr_CH_diff_GPS_valid <- distr_plot(diff_GPS_valid, "canopy_height",
                                           "Canopy height (m)")
plot_distr_CH_diff_GPS_valid
```

## Phenology

```{r}
plot_distr_SOS_DOY_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                                "SOS_DOY", "SOS DOY")
plot_distr_Peak_DOY_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                                 "Peak_DOY", "Peak DOY")
plot_distr_EOS_DOY_diff_GPS_valid <- distr_plot(diff_GPS_valid,
                                                "EOS_DOY", "EOS DOY")
plot_distr_SOS_DOY_diff_GPS_valid
plot_distr_Peak_DOY_diff_GPS_valid
plot_distr_EOS_DOY_diff_GPS_valid
```

Not very conclusive...

# Distributions from GPS (diff or not) points without rules broken so far

Create tibble with differential GPS points without rules broken so far:

```{r}
all_GPS_valid <- db_resurv_RS_short_PLOT_terrestrial %>%
  filter((`Location method` == "Location with differential GPS" | 
            `Location method` == "Location with GPS" ) &
           valid_1 == "No rules broken so far") 
```

## Max. and min. NDVI, NDMI, NDWI, SAVI and EVI

```{r}
plot_distr_NDVI_max_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "NDVI_max", "NDVI max")
plot_distr_NDMI_max_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "NDMI_max", "NDMI max")
plot_distr_NDWI_max_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "NDWI_max", "NDWI max")
plot_distr_SAVI_max_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "SAVI_max", "SAVI max")
plot_distr_EVI_max_all_GPS_valid <- distr_plot(all_GPS_valid %>%
                                              filter(EVI_max <= 1),
                                            "EVI_max", "EVI max")
plot_distr_NDVI_max_all_GPS_valid
plot_distr_NDMI_max_all_GPS_valid
plot_distr_NDWI_max_all_GPS_valid
plot_distr_SAVI_max_all_GPS_valid
plot_distr_EVI_max_all_GPS_valid
```

```{r}
plot_distr_NDVI_min_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "NDVI_min", "NDVI min")
plot_distr_NDMI_min_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "NDMI_min", "NDMI min")
plot_distr_NDWI_min_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "NDWI_min", "NDWI min")
plot_distr_SAVI_min_all_GPS_valid <- distr_plot(all_GPS_valid,
                                             "SAVI_min", "SAVI min")
plot_distr_EVI_min_all_GPS_valid <- distr_plot(all_GPS_valid %>%
                                                  # Some values wrong!
                                                  # EVI should not be lower than -1
                                                  filter(EVI_min >= -1),
                                                "EVI_min", "EVI min")
plot_distr_NDVI_min_all_GPS_valid
plot_distr_NDMI_min_all_GPS_valid
plot_distr_NDWI_min_all_GPS_valid
plot_distr_SAVI_min_all_GPS_valid
plot_distr_EVI_min_all_GPS_valid
```

## CH

```{r}
plot_distr_CH_all_GPS_valid <- distr_plot(all_GPS_valid, "canopy_height",
                                           "Canopy height (m)")
plot_distr_CH_all_GPS_valid
```

## Phenology

```{r}
plot_distr_SOS_DOY_all_GPS_valid <- distr_plot(all_GPS_valid,
                                                "SOS_DOY", "SOS DOY")
plot_distr_Peak_DOY_all_GPS_valid <- distr_plot(all_GPS_valid,
                                                 "Peak_DOY", "Peak DOY")
plot_distr_EOS_DOY_all_GPS_valid <- distr_plot(all_GPS_valid,
                                                "EOS_DOY", "EOS DOY")
plot_distr_SOS_DOY_all_GPS_valid
plot_distr_Peak_DOY_all_GPS_valid
plot_distr_EOS_DOY_all_GPS_valid
```

# Combined plot

```{r}
grid.arrange(
  plot_distr_NDVI_max_diff_GPS_valid, plot_distr_NDVI_max_all_GPS_valid,
  plot_distr_NDMI_min_diff_GPS_valid, plot_distr_NDMI_min_all_GPS_valid,
  ncol = 2)
```

# Diff GPS valid points above p20 of NDVI_max and NDMI_min for each habitat

```{r}
percentiles_diff_GPS <- diff_GPS_valid %>%
  group_by(EUNISa_1) %>%
  summarize(percentile_20_NDVI_max = quantile(NDVI_max, probs = 0.20, na.rm = T),
            percentile_20_NDMI_min = quantile(NDMI_min, probs = 0.20, na.rm = T))

diff_GPS_valid <- diff_GPS_valid %>%
  left_join(percentiles_diff_GPS, by = "EUNISa_1") %>%
  mutate(category_NDVI_max = case_when(
    NDVI_max < percentile_20_NDVI_max ~ "below_20th",
    NDVI_max >= percentile_20_NDVI_max ~ "above_20th"),
  category_NDMI_min = case_when(
    NDMI_min < percentile_20_NDMI_min ~ "below_20th",
    NDMI_min >= percentile_20_NDMI_min ~ "above_20th"))

ggplot(data = diff_GPS_valid,
       aes(x = EUNISa_1_descr, y = NDVI_max)) +
  geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8,
                   fill = "lightblue") +
  geom_point(aes(color = category_NDVI_max),
             position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 1) +
  stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1, label = length(x)),
               geom = "text", aes(label = ..label..), vjust = 0.5) +
  labs(y = "NDVI max", x = "EUNIS level 1") +
  guides(fill = FALSE, color = FALSE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_color_manual(values = c("below_20th" = "grey", "above_20th" = "lightblue")) +
  theme_bw() + coord_flip()

ggplot(data = diff_GPS_valid,
       aes(x = EUNISa_1_descr, y = NDMI_min)) +
  geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8,
                   fill = "lightblue") +
  geom_point(aes(color = category_NDMI_min),
             position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 1) +
  stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1, label = length(x)),
               geom = "text", aes(label = ..label..), vjust = 0.5) +
  labs(y = "NDMI min", x = "EUNIS level 1") +
  guides(fill = FALSE, color = FALSE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_color_manual(values = c("below_20th" = "grey", "above_20th" = "lightblue")) +
  theme_bw() + coord_flip()
```

# GPS (diff or not) valid points above p20 of NDVI_max and NDMI_min for each habitat

```{r}
percentiles_all_GPS <- all_GPS_valid %>%
  group_by(EUNISa_1) %>%
  summarize(percentile_20_NDVI_max = quantile(NDVI_max, probs = 0.20, na.rm = T),
            percentile_20_NDMI_min = quantile(NDMI_min, probs = 0.20, na.rm = T))

all_GPS_valid <- all_GPS_valid %>%
  left_join(percentiles_all_GPS, by = "EUNISa_1") %>%
  mutate(category_NDVI_max = case_when(
    NDVI_max < percentile_20_NDVI_max ~ "below_20th",
    NDVI_max >= percentile_20_NDVI_max ~ "above_20th"),
  category_NDMI_min = case_when(
    NDMI_min < percentile_20_NDMI_min ~ "below_20th",
    NDMI_min >= percentile_20_NDMI_min ~ "above_20th"))

ggplot(data = all_GPS_valid,
       aes(x = EUNISa_1_descr, y = NDVI_max)) +
  geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8,
                   fill = "lightblue") +
  geom_point(aes(color = category_NDVI_max),
             position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 1) +
  stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1, label = length(x)),
               geom = "text", aes(label = ..label..), vjust = 0.5) +
  labs(y = "NDVI max", x = "EUNIS level 1") +
  guides(fill = FALSE, color = FALSE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_color_manual(values = c("below_20th" = "grey", "above_20th" = "lightblue")) +
  theme_bw() + coord_flip()

ggplot(data = all_GPS_valid,
       aes(x = EUNISa_1_descr, y = NDMI_min)) +
  geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8,
                   fill = "lightblue") +
  geom_point(aes(color = category_NDMI_min),
             position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 1) +
  stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1, label = length(x)),
               geom = "text", aes(label = ..label..), vjust = 0.5) +
  labs(y = "NDMI min", x = "EUNIS level 1") +
  guides(fill = FALSE, color = FALSE) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_color_manual(values = c("below_20th" = "grey", "above_20th" = "lightblue")) +
  theme_bw() + coord_flip()
```

# RF models

Using the conditional inference version of random forest (cforest in package party). Suggested if the data are highly correlated. Cforest is more stable in deriving variable importance values in the presence of highly correlated variables, thus providing better accuracy in calculating variable importance (ref below).

Hothorn, T., Hornik, K. and Zeileis, A. (2006) Unbiased Recursive Portioning: A Conditional Inference Framework. Journal of Computational and Graphical Statistics, 15, 651-
674. http://dx.doi.org/10.1198/106186006X133933

## All GPS points above p20

Filter the data to get only GPS-points above p20 of NDVI_max and NDMI_min.

```{r}
all_GPS_valid <- all_GPS_valid %>%
  select(-percentile_20_NDVI_max, -percentile_20_NDMI_min)
```

```{r}
percentiles <- all_GPS_valid %>%
  group_by(EUNISa_1) %>%
  summarize(
    percentile_20_NDVI_max = quantile(NDVI_max, 0.20, na.rm = T),
    percentile_20_NDMI_min = quantile(NDMI_min, 0.20, na.rm = T),
    percentile_80_NDVI_max = quantile(NDVI_max, 0.80, na.rm = T),
    percentile_80_NDMI_min = quantile(NDMI_min, 0.80, na.rm = T)
    )

# Join the percentiles back to the original data
all_GPS_valid <- all_GPS_valid %>%
  left_join(percentiles, by = "EUNISa_1")

# Filter rows above the 20th percentile for both variables for each category of EUNISa_1
filtered_data1 <- all_GPS_valid %>%
  filter(
    NDVI_max >= percentile_20_NDVI_max & NDMI_min >= percentile_20_NDMI_min
    ) %>%
  filter(!is.na(NDVI_max) & !is.na(NDMI_max) & !is.na(NDWI_max) &
           !is.na(SAVI_max) & !is.na(EVI_max) & !is.na(NDVI_min) &
           !is.na(NDMI_min) & !is.na(NDWI_min) & !is.na(SAVI_min) &
           !is.na(EVI_min)) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(EVI_max <= 1 & EVI_min >= -1)
```

Split into training and test data sets.

```{r}
train_indices1 <- sample(1:nrow(filtered_data1), 0.7 * nrow(filtered_data1))
train_data1 <- filtered_data1[train_indices1, ]
test_data1 <- filtered_data1[-train_indices1, ]
```

Number of points per category for filtered data:

```{r}
filtered_data1 %>% count(EUNISa_1)
```

Investigate package ggparty (e.g. autoplot function, and more).

TO-DO: 
Choose the hyperparameter mtry based on the square root of the number of predictor variables (Hastie et al., 2009)-

Hastie, T., Tibshirani, R., & Friedman, J. (2009). The elements of statistical
learning: Data mining, inference, and prediction. Springer Science &
Business Media.

Maybe TO_DO:
We variated ntree from 50 to 800 in steps of 50, leaving mtry constant at 2. Tis parameter variation showed that ntree=500 was optimal, while higher ntree led to no further model improvement (Supplementary Fig. S10). Subsequently, the hyperparameter mtry was varied from 2 to 8 with constant ntree=500. Here, mtry=3 led to the best results in almost all cases (Supplementary Fig. S11). Consequently, we chose ntree=500 and mtry=3 for our main analysis across all study sites.

```{r}
rf_cforest1 <- party::cforest(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max +
                                NDMI_min + NDWI_max + NDWI_min + EVI_max +
                                EVI_min + SAVI_max + SAVI_min + canopy_height, 
                              data = train_data1,
                              controls = cforest_control(
                                mtry = 3,
                                # mtry = sqrt(11)
                                # Default mtry = 5
                                # Bagging: mtry = NULL
                                # or = number of input variables
                                ntree = 500) # Default, try increasing
                              ) 
```

```{r}
predictions_rf_cforest1 <- predict(rf_cforest1, newdata = test_data1,
                                   OOB = TRUE, type = "response")
```

Confusion matrix:

```{r}
confusionMatrix(predictions_rf_cforest1, test_data1$EUNISa_1)
```

SurrogateTree --> does not work

```{r}
varimp_rf_cforest1 <- party::varimp(rf_cforest1, conditional = F) 
# conditional = T adjusts for correlations between predictor variables
# Takes long!
```

```{r}
# party::varimp(rf_cforest1, conditional = T) 
# conditional = T adjusts for correlations between predictor variables
# Takes long!
```

Variable Importance Plot

```{r}
varimp_rf_cforest1_df <- data.frame(Variable = names(varimp_rf_cforest1),
                                    Importance = varimp_rf_cforest1)
ggplot(varimp_rf_cforest1_df,
       aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  coord_flip() + theme_minimal() +
  labs(title = "Variable Importance", x = "Variables", y = "Importance")
```

Tree Visualization

```{r}
# Create a single conditional inference tree using ctree
single_tree1 <- ctree(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max + NDMI_min +
                       NDWI_max + NDWI_min + EVI_max + EVI_min + SAVI_max +
                       SAVI_min + canopy_height,
                     data = train_data1)

# Plot the single tree using
autoplot(single_tree1)
```

## All GPS points within IQ range

Filter the data to get only GPS-points within IQ range of NDVI_max and NDMI_min.

```{r}
IQ_ranges <- all_GPS_valid %>%
  group_by(EUNISa_1) %>%
  summarize(
    Q1_NDVI_max = quantile(NDVI_max, 0.25, na.rm = T),
    Q1_NDMI_min = quantile(NDMI_min, 0.25, na.rm = T),
    Q3_NDVI_max = quantile(NDVI_max, 0.75, na.rm = T),
    Q3_NDMI_min = quantile(NDMI_min, 0.75, na.rm = T)
    )

# Join the IQ ranges back to the original data
all_GPS_valid <- all_GPS_valid %>%
  left_join(IQ_ranges, by = "EUNISa_1")

# Filter rows within the IQR range for both variables
filtered_data2 <- all_GPS_valid %>%
  filter(
    (NDVI_max >= Q1_NDVI_max & NDVI_max <= Q3_NDVI_max) &
    (NDMI_min >= Q1_NDMI_min & NDMI_min <= Q3_NDMI_min)
    ) %>%
  filter(!is.na(NDVI_max) & !is.na(NDMI_max) & !is.na(NDWI_max) &
           !is.na(SAVI_max) & !is.na(EVI_max) & !is.na(NDVI_min) &
           !is.na(NDMI_min) & !is.na(NDWI_min) & !is.na(SAVI_min) &
           !is.na(EVI_min)) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(EVI_max <= 1 & EVI_min >= -1)
```

Split into training and test data sets.

```{r}
train_indices2 <- sample(1:nrow(filtered_data2), 0.7 * nrow(filtered_data2))
train_data2 <- filtered_data2[train_indices2, ]
test_data2 <- filtered_data2[-train_indices2, ]
```

Number of points per category for filtered data:

```{r}
filtered_data2 %>% count(EUNISa_1)
```

```{r}
rf_cforest2 <- party::cforest(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max +
                                NDMI_min + NDWI_max + NDWI_min + EVI_max +
                                EVI_min + SAVI_max + SAVI_min + canopy_height, 
                              data = train_data2,
                              controls = cforest_control(
                                mtry = 3,
                                # mtry = sqrt(11)
                                # Default mtry = 5
                                # Bagging: mtry = NULL
                                # or = number of input variables
                                ntree = 500) # Default, try increasing
                              ) 
```

```{r}
predictions_rf_cforest2 <- predict(rf_cforest2, newdata = test_data2,
                                   OOB = TRUE, type = "response")
```

Confusion matrix:

```{r}
confusionMatrix(predictions_rf_cforest2, test_data2$EUNISa_1)
```

```{r}
varimp_rf_cforest2 <- party::varimp(rf_cforest2, conditional = F) 
```

Variable Importance Plot

```{r}
varimp_rf_cforest2_df <- data.frame(Variable = names(varimp_rf_cforest2),
                                    Importance = varimp_rf_cforest2)
ggplot(varimp_rf_cforest2_df,
       aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  coord_flip() + theme_minimal() +
  labs(title = "Variable Importance", x = "Variables", y = "Importance")
```

## All GPS points within mean +/- SD

Filter the data to get only GPS-points within mean +/- SD of NDVI_max and NDMI_min.

```{r}
mean_sd <- all_GPS_valid %>%
  group_by(EUNISa_1) %>%
  summarize(
    mean_NDVI_max = mean(all_GPS_valid$NDVI_max, na.rm = T),
    mean_NDMI_min = mean(all_GPS_valid$NDMI_min, na.rm = T),
    sd_NDVI_max = sd(all_GPS_valid$NDVI_max, na.rm = T),
    sd_NDMI_min = sd(all_GPS_valid$NDMI_min, na.rm = T)
    )

# Join the IQ ranges back to the original data
all_GPS_valid <- all_GPS_valid %>%
  left_join(mean_sd, by = "EUNISa_1")

# Filter rows within the specified range for both variables
filtered_data3 <- all_GPS_valid %>%
  filter(
    (NDVI_max >= (mean_NDVI_max - sd_NDVI_max) & NDVI_max <= (mean_NDVI_max + sd_NDVI_max)) &
      (NDMI_min >= (mean_NDMI_min - sd_NDMI_min) & NDMI_min <= (mean_NDMI_min + sd_NDMI_min))
    ) %>%
  filter(!is.na(NDVI_max) & !is.na(NDMI_max) & !is.na(NDWI_max) &
           !is.na(SAVI_max) & !is.na(EVI_max) & !is.na(NDVI_min) &
           !is.na(NDMI_min) & !is.na(NDWI_min) & !is.na(SAVI_min) &
           !is.na(EVI_min)) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(EVI_max <= 1 & EVI_min >= -1)
```

Split into training and test data sets.

```{r}
train_indices3 <- sample(1:nrow(filtered_data3), 0.7 * nrow(filtered_data3))
train_data3 <- filtered_data3[train_indices3, ]
test_data3 <- filtered_data3[-train_indices3, ]
```

Number of points per category for filtered data:

```{r}
filtered_data3 %>% count(EUNISa_1)
```

```{r}
rf_cforest3 <- party::cforest(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max +
                                NDMI_min + NDWI_max + NDWI_min + EVI_max +
                                EVI_min + SAVI_max + SAVI_min + canopy_height, 
                              data = train_data3,
                              controls = cforest_control(
                                mtry = 3,
                                # mtry = sqrt(11)
                                # Default mtry = 5
                                # Bagging: mtry = NULL
                                # or = number of input variables
                                ntree = 500) # Default, try increasing
                              ) 
```

```{r}
predictions_rf_cforest3 <- predict(rf_cforest3, newdata = test_data3,
                                   OOB = TRUE, type = "response")
```

Confusion matrix:

```{r}
confusionMatrix(predictions_rf_cforest3, test_data3$EUNISa_1)
```

```{r}
varimp_rf_cforest3 <- party::varimp(rf_cforest3, conditional = F) 
```

Variable Importance Plot

```{r}
varimp_rf_cforest3_df <- data.frame(Variable = names(varimp_rf_cforest3),
                                    Importance = varimp_rf_cforest3)
ggplot(varimp_rf_cforest3_df,
       aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  coord_flip() + theme_minimal() +
  labs(title = "Variable Importance", x = "Variables", y = "Importance")
```

## All GPS points above p20 and below p80

Filter the data to get only GPS-points above p20 and below p80 of NDVI_max and NDMI_min.

```{r}
# Filter rows above the 20th percentile and below the 80th percentile for both variables
filtered_data4 <- all_GPS_valid %>%
  filter(
    (NDVI_max >= percentile_20_NDVI_max & NDVI_max <= percentile_80_NDVI_max) &
    (NDMI_min >= percentile_20_NDMI_min & NDMI_min <= percentile_80_NDMI_min)
    ) %>%
  filter(!is.na(NDVI_max) & !is.na(NDMI_max) & !is.na(NDWI_max) &
           !is.na(SAVI_max) & !is.na(EVI_max) & !is.na(NDVI_min) &
           !is.na(NDMI_min) & !is.na(NDWI_min) & !is.na(SAVI_min) &
           !is.na(EVI_min)) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(EVI_max <= 1 & EVI_min >= -1)
```

Split into training and test data sets.

```{r}
train_indices4 <- sample(1:nrow(filtered_data4), 0.7 * nrow(filtered_data4))
train_data4 <- filtered_data4[train_indices4, ]
test_data4 <- filtered_data4[-train_indices4, ]
```

Number of points per category for filtered data:

```{r}
filtered_data4 %>% count(EUNISa_1)
```

```{r}
rf_cforest4 <- party::cforest(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max +
                                NDMI_min + NDWI_max + NDWI_min + EVI_max +
                                EVI_min + SAVI_max + SAVI_min + canopy_height, 
                              data = train_data4,
                              controls = cforest_control(
                                mtry = 3,
                                # mtry = sqrt(11)
                                # Default mtry = 5
                                # Bagging: mtry = NULL
                                # or = number of input variables
                                ntree = 500) # Default, try increasing
                              ) 
```

```{r}
predictions_rf_cforest4 <- predict(rf_cforest4, newdata = test_data4,
                                   OOB = TRUE, type = "response")
```

Confusion matrix:

```{r}
confusionMatrix(predictions_rf_cforest4, test_data4$EUNISa_1)
```

# HERE: Compare RF 1-4

# Cordillera data

```{r}
AlpineGrasslands_indices <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrassland_Sentinel_Plot_Allyear_Allmetrics.csv")
AlpineGrasslands_phen <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrasslands_Phenology_SOS_EOS_Peak_NDVI_Amplitude.csv")
AlpineGrasslands_CH <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrasslands_CanopyHeight_1m.csv")
VegetationTypes_indices <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/VegetationTypes/VegetationTypes_Sentinel_Plot_AllYear_Allmetrics.csv")
VegetationTypes_phen <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/VegetationTypes/VegetationTypes_Phenology_SOS_EOS_Peak_NDVI_Amplitude.csv")
VegetationTypes_CH <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/VegetationTypes/VegetationTypes_CanopyHeight_1m.csv")
```

```{r}
AlpineGrasslands <- AlpineGrasslands_indices %>%
  select(-`system:index`, -.geo, -Localidad) %>%
  rename(Hábitat = "H�bitat") %>% 
  full_join(AlpineGrasslands_phen  %>%
              select(-`system:index`, -.geo, -Localidad) %>%
              rename(Hábitat = "H�bitat")) %>%
  full_join(AlpineGrasslands_CH  %>%
              select(-`system:index`, -.geo, -Localidad)) %>%
  select(-Date__year, - `Precisi�n`) %>%
  mutate(DATE = ymd(DATE)) %>%
  rename(ID = "Releve_num") %>%
  mutate(ID = as.character(ID)) %>%
  mutate(layer = "AlpineGrasslands")
```

```{r}
VegetationTypes <- VegetationTypes_indices %>%
  select(-`system:index`, -.geo) %>%
  full_join(VegetationTypes_phen  %>%
              select(-`system:index`, -.geo)) %>%
  full_join(VegetationTypes_CH  %>%
              select(-`system:index`, -.geo)) %>%
  rename(Hábitat = "TYPE") %>%
  mutate(layer = "VegetationTypes")
```

Merge both datasets:

```{r}
cordillera <- bind_rows(
  AlpineGrasslands %>% select(DATE, ID, starts_with("NDMI"),
                              starts_with("NDVI"), Hábitat, "EOS_DOY",
                              "Peak_DOY", "SOS_DOY", "Season_Length",
                              "canopy_height", "layer"),
  VegetationTypes %>% select(DATE, ID, starts_with("NDMI"),
                              starts_with("NDVI"), Hábitat, "EOS_DOY",
                              "Peak_DOY", "SOS_DOY", "Season_Length",
                              "canopy_height", "layer")
  ) %>%
  mutate(EUNISa_1 = case_when(
    Hábitat = str_detect(Hábitat, "Pastizal|Cervunal|grassland|meadow") ~ "R",
    Hábitat = str_detect(Hábitat, "forest") ~ "T",
    Hábitat = str_detect(Hábitat, "Scrub|scrub|Shrubland|shrubland|shrub|Heathland") ~ "S",
    Hábitat = str_detect(Hábitat, "Suelo|Scree|scree|cliff") ~ "U",
    Hábitat = is.na(Hábitat) ~ "R",
    TRUE ~ NA_character_),
    EUNISa_1_descr = case_when(
      EUNISa_1 == "R" ~ "Grasslands",
      EUNISa_1 == "T" ~ "Forests and other wooded land",
      EUNISa_1 == "S" ~ "Heathlands, scrub and tundra",
      EUNISa_1 == "U" ~ "Inland habitats with no or little soil")
    )
```

## Max. and min. NDVI, NDMI

```{r}
plot_distr_NDVI_max_cordi <- distr_plot(cordillera,
                                             "NDVI_max", "NDVI max")
plot_distr_NDVI_min_cordi <- distr_plot(cordillera,
                                             "NDVI_min", "NDVI min")
plot_distr_NDMI_max_cordi <- distr_plot(cordillera,
                                             "NDMI_max", "NDMI max")
plot_distr_NDMI_min_cordi <- distr_plot(cordillera,
                                             "NDMI_min", "NDMI min")
plot_distr_NDVI_max_cordi
plot_distr_NDVI_min_cordi
plot_distr_NDMI_max_cordi
plot_distr_NDMI_min_cordi
```

# OLD from here

### Plots NDVI and NDMI

```{r}
ggplot(data = cordi %>% filter(!is.na(NDVI_max)) %>%
         # Keep only forests, grasslands, shrublands and wetlands
         filter(EUNIS_1 %in% c("T", "R", "S", "Q")),
       aes(x = EUNIS_1, y = NDVI_max, fill = EUNIS_1)) +
  geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8) +
  geom_point(aes(y = NDVI_max, color = EUNIS_1),
             position = position_jitter(width = 0.15), size = 3,
             alpha = 0.5) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
  stat_summary(fun.y=mean, geom="point", shape = 20, size = 3) +
  stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                 label = length(x)),
               geom = "text", aes(label = ..label..), vjust = 0.5) +
  guides(fill = FALSE, color = FALSE) + theme_bw() + coord_flip()
ggsave(here("output", "figures", "cordi_NDVI_max.tiff"),
       width = 21, height = 29.7, units = "cm", dpi = 300)
ggplot(data = cordi %>% filter(!is.na(NDMI_max)) %>%
         # Keep only forests, grasslands, shrublands and wetlands
         filter(EUNIS_1 %in% c("T", "R", "S", "Q")),
       aes(x = EUNIS_1, y = NDMI_max, fill = EUNIS_1)) +
  geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8) +
  geom_point(aes(y = NDMI_max, color = EUNIS_1),
             position = position_jitter(width = 0.15), size = 3,
             alpha = 0.5) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
  stat_summary(fun.y=mean, geom="point", shape = 20, size = 3) +
  stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                 label = length(x)),
               geom = "text", aes(label = ..label..), vjust = 0.5) +
  guides(fill = FALSE, color = FALSE) + theme_bw() + coord_flip()
ggsave(here("output", "figures", "cordi_NDMI_max.tiff"),
       width = 21, height = 29.7, units = "cm", dpi = 300)
```

### Maps

```{r}
# Load world boundaries
world <- ne_countries(scale = "medium", returnclass = "sf")
```

```{r}
# Calculate the extent of the points
points_cordi_extent <- cordi %>% filter(EUNIS_1 %in% c("T", "R", "S", "Q")) %>%
  summarise(lon_min = min(longitude, na.rm = TRUE),
            lon_max = max(longitude, na.rm = TRUE),
            lat_min = min(latitude, na.rm = TRUE),
            lat_max = max(latitude, na.rm = TRUE))

# Add padding to the extent (adjust as needed)
padding <- 3  # Adjust padding to your preference
x_limits <- c(points_cordi_extent$lon_min - padding,
              points_cordi_extent$lon_max + padding)
y_limits <- c(points_cordi_extent$lat_min - padding,
              points_cordi_extent$lat_max + padding)

# Create the zoomed map
ggplot() +
  geom_sf(data = world, fill = "lightblue", color = "gray") +
  geom_point(data = cordi %>% filter(EUNIS_1 %in% c("T", "R", "S", "Q")),
             aes(x = longitude, y = latitude, color = EUNIS_1),
             size = 1) +
  coord_sf(xlim = x_limits, ylim = y_limits) +
  theme_minimal()
```

## Points differential GPS

### Maps

```{r}
# Calculate the extent of the points
points_GPS_extent <- db_resurv_RS_short_PLOT %>%
  filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  filter(S2_data == T | Landsat_data == T ) %>%
  filter(`Location method` == "Location with differential GPS") %>%
  summarise(lon_min = min(Lon_updated, na.rm = TRUE),
            lon_max = max(Lon_updated, na.rm = TRUE),
            lat_min = min(Lat_updated, na.rm = TRUE),
            lat_max = max(Lat_updated, na.rm = TRUE))

# Add padding to the extent (adjust as needed)
padding <- 2  # Adjust padding to your preference
x_limits <- c(points_GPS_extent$lon_min - padding,
              points_GPS_extent$lon_max + padding)
y_limits <- c(points_GPS_extent$lat_min - padding,
              points_GPS_extent$lat_max + padding)

# Create the zoomed map
ggplot() +
  geom_sf(data = world, fill = "lightblue", color = "gray") +
  geom_point(data = db_resurv_RS_short_PLOT %>%
               filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
               filter(S2_data == T | Landsat_data == T ) %>%
               filter(`Location method` == "Location with differential GPS"),
             aes(x = Lon_updated, y = Lat_updated, color = EUNISa_1),
             size = 1) +
  coord_sf(xlim = x_limits, ylim = y_limits) +
  theme_minimal()
```

Number of differential GPS points by Country:

```{r}
db_resurv_RS_short_PLOT %>%
  filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  filter(S2_data == T | Landsat_data == T ) %>%
  filter(`Location method` == "Location with differential GPS") %>%
  count(Country)
```

## Points ReSurvey

### Maps

```{r}
# Calculate the extent of the points
points_resurvey_extent <- db_resurv_RS_short_PLOT %>%
  filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  filter(S2_data == T | Landsat_data == T ) %>%
  summarise(lon_min = min(Lon_updated, na.rm = TRUE),
            lon_max = max(Lon_updated, na.rm = TRUE),
            lat_min = min(Lat_updated, na.rm = TRUE),
            lat_max = max(Lat_updated, na.rm = TRUE))

# Add padding to the extent (adjust as needed)
padding <- 2  # Adjust padding to your preference
x_limits <- c(points_resurvey_extent$lon_min - padding,
              points_resurvey_extent$lon_max + padding)
y_limits <- c(points_resurvey_extent$lat_min - padding,
              points_resurvey_extent$lat_max + padding)

# Create the zoomed map
ggplot() +
  geom_sf(data = world, fill = "lightblue", color = "gray") +
  geom_point(data = db_resurv_RS_short_PLOT %>%
               filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
               filter(S2_data == T | Landsat_data == T ),
             aes(x = Lon_updated, y = Lat_updated, color = EUNISa_1),
             size = 1) +
  coord_sf(xlim = x_limits, ylim = y_limits) +
  theme_minimal()
```

Number of ReSurvey points by Country:

```{r}
db_resurv_RS_short_PLOT %>%
  filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  filter(S2_data == T | Landsat_data == T ) %>%
  count(Country)
```

## Join

```{r}
points_halle <- bind_rows(
  cordi %>% filter(EUNIS_1 %in% c("T", "R", "S", "Q")) %>%
    select(longitude, latitude, NDVI_max, NDMI_max, EUNIS_1) %>%
    rename(EUNIS = EUNIS_1) %>%
    mutate(point_type = "Cordillera"),
  db_resurv_RS_short_PLOT %>%
    filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
    filter(S2_data == T | Landsat_data == T ) %>%
    filter(`Location method` == "Location with differential GPS") %>%
    select(Lon_updated, Lat_updated, NDVI_max, NDMI_max, EUNISa_1) %>%
    rename(longitude = Lon_updated, latitude = Lat_updated,
           EUNIS = EUNISa_1) %>%
    mutate(point_type = "ReSurvey differential GPS"),
  db_resurv_RS_short_PLOT %>%
    filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
    filter(S2_data == T | Landsat_data == T ) %>%
    select(Lon_updated, Lat_updated, NDVI_max, NDMI_max, EUNISa_1) %>%
    rename(longitude = Lon_updated, latitude = Lat_updated,
           EUNIS = EUNISa_1) %>%
    mutate(point_type = "ReSurvey")
)
```

```{r}
ggplot(data = points_halle,
       aes(x = point_type, y = NDVI_max, fill = point_type)) +
  geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8) +
  geom_point(aes(y = NDVI_max, color = point_type),
             position = position_jitter(width = 0.15), size = 3,
             alpha = 0.5) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
  stat_summary(fun.y=mean, geom="point", shape = 20, size = 3) +
  stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                 label = length(x)),
               geom = "text", aes(label = ..label..), vjust = 0.5) +
  guides(fill = FALSE, color = FALSE) + theme_bw() + coord_flip() +
  facet_wrap(~ EUNIS)
```

# Session info

```{r}
sessionInfo()
```

